LogicalStructure

Sample Kotlin code that uses the Apryse SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; ContentItems are the leaf nodes of the structure tree. Learn more about our Android SDK and PDF Data Extraction SDK Capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5package com.pdftron.android.pdfnetsdksamples.samples
6
7import com.pdftron.android.pdfnetsdksamples.OutputListener
8import com.pdftron.android.pdfnetsdksamples.PDFNetSample
9import com.pdftron.android.pdfnetsdksamples.R
10import com.pdftron.android.pdfnetsdksamples.util.Utils
11import com.pdftron.common.PDFNetException
12import com.pdftron.pdf.PDFDoc
13import com.pdftron.pdf.PageIterator
14import com.pdftron.pdf.struct.ContentItem
15import com.pdftron.pdf.struct.SElement
16import com.pdftron.pdf.struct.STree
17import com.pdftron.sdf.SDFDoc
18
/**
 * Sample that opens `tagged.pdf`, walks its logical structure in three
 * different ways and writes everything it finds to the [OutputListener]
 * handed to [run]:
 *
 * 1. a recursive dump of the structure tree,
 * 2. a per-page scan that reports the parent structure element of each
 *    path, text and form XObject element,
 * 3. an "XML style" dump that combines the structure tree with the text
 *    collected per marked-content id (MCID) on every page.
 */
class LogicalStructureTest : PDFNetSample() {

    init {
        setTitle(R.string.sample_logicalstructure_title)
        setDescription(R.string.sample_logicalstructure_description)
    }

    /**
     * Runs the three snippets against `tagged.pdf` and then saves the document
     * as `LogicalStructure.pdf`.
     *
     * Any exception raised while processing the document is reported through
     * [OutputListener.printError] instead of being propagated.
     *
     * @param outputListener sink for all sample output; must not be null.
     */
    override fun run(outputListener: OutputListener?) {
        super.run(outputListener)
        mOutputListener = outputListener
        mFileList.clear()
        // From here on the compiler treats outputListener as non-null.
        printHeader(outputListener!!)
        try {
            // Extract logical structure from a PDF document.
            val inputFile = checkNotNull(Utils.getAssetTempFile("${INPUT_PATH}tagged.pdf")) {
                "Utils.getAssetTempFile returned null for tagged.pdf"
            }
            PDFDoc(inputFile.absolutePath).use { doc ->
                doc.initSecurityHandler()

                out.println(SEPARATOR)
                out.println("Sample 1 - Traverse logical structure tree...")
                traverseStructTree(doc)
                out.println("\nDone 1.")

                out.println(SEPARATOR)
                out.println("Sample 2 - Get parent logical structure elements from")
                out.println("layout elements.")
                printParentStructElements(doc)
                out.println("\nDone 2.")

                out.println(SEPARATOR)
                out.println("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
                printXmlStyleStructure(doc)
                out.println("\nDone 3.")

                doc.save(
                    Utils.createExternalFile("LogicalStructure.pdf", mFileList).absolutePath,
                    SDFDoc.SaveMode.LINEARIZED,
                    null
                )
            }
        } catch (e: Exception) {
            out.printError(e.stackTrace)
        }
        for (file in mFileList) {
            addToFileList(file)
        }
        printFooter(outputListener)
    }

    /** Snippet 1: recursively prints every element reachable from the structure tree root. */
    private fun traverseStructTree(doc: PDFDoc) {
        val tree: STree = doc.getStructTree()
        if (!tree.isValid()) {
            out.println("This document does not contain any logical structure.")
            return
        }
        out.println("Document has a StructTree root.")
        for (i in 0 until tree.getNumKids()) {
            // Recursively get structure info for all child elements.
            ProcessStructElement(tree.getKid(i), 0)
        }
    }

    /** Snippet 2: reads every page and reports the structure parent of its layout elements. */
    private fun printParentStructElements(doc: PDFDoc) {
        val reader = com.pdftron.pdf.ElementReader()
        val pages: PageIterator = doc.getPageIterator()
        while (pages.hasNext()) {
            reader.begin(pages.next())
            ProcessElements(reader)
            reader.end()
        }
    }

    /**
     * Snippet 3: first collects the text of every marked-content id on every
     * page, then prints the structure tree with that text inlined.
     */
    private fun printXmlStyleStructure(doc: PDFDoc) {
        // Maps a page number to that page's map of struct MCID -> text.
        val mcidDocMap: MutableMap<Int, Map<Int, String>> = java.util.TreeMap<Int, Map<Int, String>>()
        val reader = com.pdftron.pdf.ElementReader()
        val pages: PageIterator = doc.getPageIterator()
        while (pages.hasNext()) {
            val page = pages.next()
            reader.begin(page)
            val pageMcidMap: MutableMap<Int, String> = java.util.TreeMap<Int, String>()
            mcidDocMap[page.getIndex()] = pageMcidMap
            ProcessElements2(reader, pageMcidMap)
            reader.end()
        }

        val tree: STree = doc.getStructTree()
        if (tree.isValid()) {
            for (i in 0 until tree.getNumKids()) {
                ProcessStructElement2(tree.getKid(i), mcidDocMap, 0)
            }
        }
    }

    companion object {
        private const val SEPARATOR = "____________________________________________________________"

        private var mOutputListener: OutputListener? = null
        private val mFileList: java.util.ArrayList<String> = java.util.ArrayList<String>()

        /**
         * The listener installed by the most recent [run] call.
         *
         * @throws IllegalStateException if no listener has been installed yet.
         */
        private val out: OutputListener
            get() = checkNotNull(mOutputListener) {
                "No OutputListener installed; LogicalStructureTest.run() must be called first"
            }

        /** Starts a new output line and prints [indent] single spaces on it. */
        fun PrintIndent(indent: Int) {
            out.println()
            repeat(indent) { out.print(" ") }
        }

        /**
         * Used in code snippet 1.
         *
         * Prints the type (and title, if present) of [element] at the given
         * [indent], then visits its kids one level deeper: content items are
         * printed in place, nested structure elements are handled recursively.
         * Invalid elements are skipped silently.
         */
        @Throws(PDFNetException::class)
        fun ProcessStructElement(element: SElement, indent: Int) {
            if (!element.isValid()) {
                return
            }

            // Print out the type and title info, if any.
            PrintIndent(indent)
            out.print("Type: ${element.getType()}")
            if (element.hasTitle()) {
                out.print(". Title: ${element.getTitle()}")
            }

            val childIndent = indent + 1
            for (i in 0 until element.getNumKids()) {
                // Check if the kid is a leaf node (i.e. it is a ContentItem).
                if (!element.isContentItem(i)) {
                    // The kid is another StructElement node.
                    ProcessStructElement(element.getAsStructElem(i), childIndent)
                    continue
                }

                val item: ContentItem = element.getAsContentItem(i)
                val type: Int = item.getType()
                val page: com.pdftron.pdf.Page = item.getPage()
                PrintIndent(childIndent)
                out.print("Content Item. Part of page #${page.getIndex()}")
                PrintIndent(childIndent)
                when (type) {
                    ContentItem.e_MCID, ContentItem.e_MCR -> out.print("MCID: ${item.getMCID()}")
                    ContentItem.e_OBJR -> {
                        out.print("OBJR ")
                        // The referenced object comes from Java and may be null,
                        // so it is only dereferenced through a safe call.
                        item.getRefObj()?.let { ref ->
                            out.print("- Referenced Object#: ${ref.getObjNum()}")
                        }
                    }
                    else -> Unit
                }
            }
        }

        /**
         * Used in code snippet 2.
         *
         * Reads elements from [reader] until it is exhausted. For each path,
         * text or form XObject element it prints a label and, when the element
         * has a valid parent structure element, that parent's type, the
         * element's MCID, the parent's title (if any) and its object number.
         * All other element types are ignored.
         */
        @Throws(PDFNetException::class)
        fun ProcessElements(reader: com.pdftron.pdf.ElementReader) {
            // Read page contents. The loop is labelled so the `when` below can
            // skip uninteresting elements on older Kotlin versions as well.
            elements@ while (true) {
                val element = reader.next() ?: break

                // In this sample we process only paths, text and form XObjects,
                // but the code can be extended to handle any element type.
                when (element.getType()) {
                    com.pdftron.pdf.Element.e_path -> out.print("\nPATH: ")
                    // Plain template: the extracted text is printed verbatim,
                    // with no indentation trimming applied to it.
                    com.pdftron.pdf.Element.e_text -> out.print("\nTEXT: ${element.getTextString()}\n")
                    com.pdftron.pdf.Element.e_form -> out.print("\nFORM XObject: ")
                    else -> continue@elements
                }

                // Check if the element is associated with any structural element.
                // Content items are leaf nodes of the structure tree.
                val structParent: SElement = element.getParentStructElement()
                if (structParent.isValid()) {
                    // Print out the parent structural element's type, title, and object number.
                    out.print(" Type: ${structParent.getType()}, MCID: ${element.getStructMCID()}")
                    if (structParent.hasTitle()) {
                        out.print(". Title: ${structParent.getTitle()}")
                    }
                    out.print(", Obj#: ${structParent.getSDFObj().getObjNum()}")
                }
            }
        }

        /**
         * Used in code snippet 3.
         *
         * Reads elements from [reader] until it is exhausted and appends the
         * text of every text element that has a non-negative struct MCID to
         * the entry for that MCID in [mcid_page_map].
         */
        @Throws(PDFNetException::class)
        fun ProcessElements2(reader: com.pdftron.pdf.ElementReader, mcid_page_map: MutableMap<Int, String>) {
            // Read page contents.
            while (true) {
                val element = reader.next() ?: break

                // In this sample we process only text, but the code can be extended
                // to handle paths, images, or any other Element type.
                val mcid: Int = element.getStructMCID()
                if (mcid >= 0 && element.getType() == com.pdftron.pdf.Element.e_text) {
                    val text: String = element.getTextString()
                    mcid_page_map[mcid] = (mcid_page_map[mcid] ?: "") + text
                }
            }
        }

        /**
         * Used in code snippet 3.
         *
         * Prints [element] as an opening/closing tag pair named after its type
         * (with a `title` attribute when it has one). Between the tags, MCID
         * content items are replaced by the text recorded for their page and
         * MCID in [mcid_doc_map], and nested structure elements are printed
         * recursively one indent level deeper. Invalid elements are skipped.
         */
        @Throws(PDFNetException::class)
        fun ProcessStructElement2(element: SElement, mcid_doc_map: Map<Int, Map<Int, String>>, indent: Int) {
            if (!element.isValid()) {
                return
            }

            // Print out the type and title info, if any.
            PrintIndent(indent)
            out.print("<" + element.getType())
            if (element.hasTitle()) {
                out.print(" title=\"" + element.getTitle() + "\"")
            }
            out.print(">")

            for (i in 0 until element.getNumKids()) {
                if (!element.isContentItem(i)) {
                    // The kid is another StructElement node.
                    ProcessStructElement2(element.getAsStructElem(i), mcid_doc_map, indent + 1)
                    continue
                }

                val item: ContentItem = element.getAsContentItem(i)
                if (item.getType() != ContentItem.e_MCID) {
                    continue
                }
                // Nothing is printed when no text was collected for this page or MCID.
                val pageMap = mcid_doc_map[item.getPage().getIndex()] ?: continue
                pageMap[item.getMCID()]?.let { text -> out.print(text) }
            }

            PrintIndent(indent)
            out.print("</" + element.getType() + ">")
        }
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales