Sample Kotlin code for using the Apryse SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, the StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; the ContentItems are the leaf nodes of the structure tree. Learn more about our Android SDK and PDF Data Extraction SDK Capabilities.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5package com.pdftron.android.pdfnetsdksamples.samples
6
7import com.pdftron.android.pdfnetsdksamples.OutputListener
8import com.pdftron.android.pdfnetsdksamples.PDFNetSample
9import com.pdftron.android.pdfnetsdksamples.R
10import com.pdftron.android.pdfnetsdksamples.util.Utils
11import com.pdftron.common.PDFNetException
12import com.pdftron.pdf.PDFDoc
13import com.pdftron.pdf.PageIterator
14import com.pdftron.pdf.struct.ContentItem
15import com.pdftron.pdf.struct.SElement
16import com.pdftron.pdf.struct.STree
17import com.pdftron.sdf.SDFDoc
18
19class LogicalStructureTest : PDFNetSample() {
/**
 * Runs the three logical-structure samples against the bundled `tagged.pdf`:
 *
 * 1. walks the structure tree and prints every element and content item,
 * 2. walks the page content and prints each element's parent structure element,
 * 3. prints the structure tree "XML style", filling in the page text per MCID.
 *
 * The document is then saved as `LogicalStructure.pdf`. Any exception raised while
 * processing is reported through the listener's `printError` instead of propagating.
 *
 * @param outputListener sink for all sample output; asserted non-null before the header is printed.
 */
override fun run(outputListener: OutputListener?) {
    super.run(outputListener)
    mOutputListener = outputListener
    mFileList.clear()

    // One non-null assertion for the whole method; the static helpers read the
    // same listener through mOutputListener.
    val out = outputListener!!
    printHeader(out)

    val separator = "____________________________________________________________"
    try {
        // Extract logical structure from a PDF document
        val inputFile = Utils.getAssetTempFile(INPUT_PATH.toString() + "tagged.pdf")!!
        PDFDoc(inputFile.absolutePath).use { doc ->
            doc.initSecurityHandler()

            out.println(separator)
            out.println("Sample 1 - Traverse logical structure tree...")
            val structTree: STree = doc.structTree
            if (!structTree.isValid) {
                out.println("This document does not contain any logical structure.")
            } else {
                out.println("Document has a StructTree root.")
                // Recursively get structure info for all child elements.
                for (kid in 0 until structTree.numKids) {
                    ProcessStructElement(structTree.getKid(kid), 0)
                }
            }
            out.println("\nDone 1.")

            out.println(separator)
            out.println("Sample 2 - Get parent logical structure elements from")
            out.println("layout elements.")
            val layoutReader = com.pdftron.pdf.ElementReader()
            val layoutPages: PageIterator = doc.pageIterator
            while (layoutPages.hasNext()) {
                layoutReader.begin(layoutPages.next())
                ProcessElements(layoutReader)
                layoutReader.end()
            }
            out.println("\nDone 2.")

            out.println(separator)
            out.println("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
            // Maps a page number to that page's map of struct MCID -> text.
            val textByPage: MutableMap<Int, Map<Int, String>> = java.util.TreeMap<Int, Map<Int, String>>()
            val textReader = com.pdftron.pdf.ElementReader()
            val textPages: PageIterator = doc.pageIterator
            while (textPages.hasNext()) {
                val page: com.pdftron.pdf.Page? = textPages.next()
                textReader.begin(page)
                val textByMcid: MutableMap<Int, String> = java.util.TreeMap<Int, String>()
                textByPage[page!!.index] = textByMcid
                ProcessElements2(textReader, textByMcid)
                textReader.end()
            }
            val xmlTree: STree = doc.structTree
            if (xmlTree.isValid) {
                for (kid in 0 until xmlTree.numKids) {
                    ProcessStructElement2(xmlTree.getKid(kid), textByPage, 0)
                }
            }
            out.println("\nDone 3.")

            val outputFile = Utils.createExternalFile("LogicalStructure.pdf", mFileList)
            doc.save(outputFile.absolutePath, SDFDoc.SaveMode.LINEARIZED, null)
        }
    } catch (e: java.lang.Exception) {
        out.printError(e.stackTrace)
    }

    for (path in mFileList) {
        addToFileList(path)
    }
    printFooter(outputListener)
}
93
94 companion object {
95 private var mOutputListener: OutputListener? = null
96 private val mFileList: java.util.ArrayList<String> = java.util.ArrayList<String>()
/**
 * Starts a new output line and then writes one indentation unit per level.
 *
 * @param indent number of indentation units to print; values <= 0 print none.
 */
fun PrintIndent(indent: Int) {
    val out = mOutputListener!!
    out.println()
    repeat(indent) { out.print(" ") }
}
101
/**
 * Used in code snippet 1: recursively prints a structure element and everything below it.
 *
 * For [element] this prints its type (and title, if it has one) at [indent]; every kid
 * is printed one level deeper. Content-item kids print their page number followed by
 * their MCID or, for object references, the referenced object number. Struct-element
 * kids are handled by recursion. Invalid elements are skipped silently.
 *
 * @param element structure element to print.
 * @param indent indentation level of [element] itself.
 */
@Throws(PDFNetException::class)
fun ProcessStructElement(element: SElement, indent: Int) {
    if (!element.isValid) {
        return
    }

    // Print out the type and title info, if any.
    PrintIndent(indent)
    val out = mOutputListener!!
    out.print("Type: " + element.type)
    if (element.hasTitle()) {
        out.print(". Title: " + element.title)
    }

    // Kids are printed one level deeper than their parent.
    val childIndent = indent + 1
    for (i in 0 until element.numKids) {
        // Check if the kid is a leaf node (i.e. it is a ContentItem).
        if (!element.isContentItem(i)) {
            // The kid is another StructElement node.
            ProcessStructElement(element.getAsStructElem(i), childIndent)
            continue
        }

        val cont: ContentItem = element.getAsContentItem(i)
        val type: Int = cont.type
        val page: com.pdftron.pdf.Page = cont.page
        PrintIndent(childIndent)
        out.print("Content Item. Part of page #" + page.index)
        PrintIndent(childIndent)
        when (type) {
            ContentItem.e_MCID, ContentItem.e_MCR -> out.print("MCID: " + cont.getMCID())
            ContentItem.e_OBJR -> {
                out.print("OBJR ")
                // Declared nullable on purpose: the SDK call comes from Java, and a
                // non-null declaration would throw on a null result before the
                // null check could ever run.
                val refObj: com.pdftron.sdf.Obj? = cont.getRefObj()
                if (refObj != null) {
                    out.print("- Referenced Object#: " + refObj.objNum)
                }
            }
            else -> {
                // Other content item kinds carry nothing extra to print.
            }
        }
    }
}
141
/**
 * Used in code snippet 2: reads every element from [reader] and, for paths, text and
 * form XObjects, prints a label followed by the parent structure element (type, MCID,
 * optional title and object number) when the element has a valid one.
 *
 * The original filter tested `e_path` twice, which made the `e_form` branch
 * unreachable; form XObjects are now reported as that branch intended.
 *
 * @param reader element reader on which `begin` has already been called for a page.
 */
@Throws(PDFNetException::class)
fun ProcessElements(reader: com.pdftron.pdf.ElementReader) {
    // Read page contents until the reader runs out of elements.
    while (true) {
        val element: com.pdftron.pdf.Element = reader.next() ?: break

        // In this sample we process only paths, text and form XObjects, but the
        // code can be extended to handle any element type.
        val label: String? = when (element.type) {
            com.pdftron.pdf.Element.e_path -> "\nPATH: "
            // Plain concatenation: a raw string with trimIndent() could re-indent
            // extracted text that itself contains line breaks.
            com.pdftron.pdf.Element.e_text -> "\nTEXT: " + element.textString + "\n"
            com.pdftron.pdf.Element.e_form -> "\nFORM XObject: "
            else -> null
        }
        if (label == null) {
            continue
        }
        val out = mOutputListener!!
        out.print(label)

        // Check if the element is associated with any structural element.
        // Content items are leaf nodes of the structure tree.
        val structParent: SElement = element.parentStructElement
        if (structParent.isValid) {
            // Print out the parent structural element's type, title, and object number.
            out.print(" Type: " + structParent.type + ", MCID: " + element.getStructMCID())
            if (structParent.hasTitle()) {
                out.print(". Title: " + structParent.title)
            }
            out.print(", Obj#: " + structParent.getSDFObj().objNum)
        }
    }
}
181
/**
 * Used in code snippet 3: collects the text of one page, grouped by struct MCID.
 *
 * Every text element whose MCID is non-negative has its text appended to the entry
 * for that MCID in [mcid_page_map]; an entry is created on first use.
 *
 * @param reader element reader on which `begin` has already been called for a page.
 * @param mcid_page_map receives MCID -> concatenated text for the page being read.
 */
@Throws(PDFNetException::class)
fun ProcessElements2(reader: com.pdftron.pdf.ElementReader, mcid_page_map: MutableMap<Int, String>) {
    // Read page contents until the reader runs out of elements.
    while (true) {
        val element: com.pdftron.pdf.Element = reader.next() ?: break

        // In this sample we process only text, but the code can be extended
        // to handle paths, images, or any other Element type.
        val mcid: Int = element.getStructMCID()
        if (mcid < 0 || element.type != com.pdftron.pdf.Element.e_text) {
            continue
        }

        val text: String = element.textString
        val soFar: String? = mcid_page_map[mcid]
        mcid_page_map[mcid] = if (soFar != null) soFar + text else text
    }
}
204
/**
 * Used in code snippet 3: prints [element] and its subtree as XML-like tags.
 *
 * Emits `<Type title="...">` at [indent], then for each kid either the text recorded
 * for a marked-content item (looked up by page number and MCID in [mcid_doc_map]) or,
 * for a nested struct element, its own tag block one level deeper, and finally the
 * closing tag. Invalid elements are skipped silently.
 *
 * @param element structure element to print.
 * @param mcid_doc_map page number -> (MCID -> text), as filled in by ProcessElements2.
 * @param indent indentation level of the opening and closing tags.
 */
@Throws(PDFNetException::class)
fun ProcessStructElement2(element: SElement, mcid_doc_map: Map<Int, Map<Int, String>>, indent: Int) {
    if (!element.isValid) {
        return
    }

    // Opening tag with the type and, if any, the title.
    PrintIndent(indent)
    val out = mOutputListener!!
    out.print("<" + element.type)
    if (element.hasTitle()) {
        out.print(" title=\"" + element.title + "\"")
    }
    out.print(">")

    for (kid in 0 until element.numKids) {
        if (!element.isContentItem(kid)) {
            // The kid is another StructElement node.
            ProcessStructElement2(element.getAsStructElem(kid), mcid_doc_map, indent + 1)
            continue
        }

        val item: ContentItem = element.getAsContentItem(kid)
        if (item.type != ContentItem.e_MCID) {
            continue
        }
        // Only print text that was actually collected for this page and MCID.
        val pageText: Map<Int, String> = mcid_doc_map[item.page.index] ?: continue
        val text: String? = pageText[item.getMCID()]
        if (text != null) {
            out.print(text)
        }
    }

    // Closing tag.
    PrintIndent(indent)
    out.print("</" + element.type + ">")
}
240 /**
241 * @param args
242 */
243 }
244
// Sets this sample's title and description from string resources when the
// instance is constructed.
init {
    setTitle(R.string.sample_logicalstructure_title)
    setDescription(R.string.sample_logicalstructure_description)
}
249}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package com.pdftron.android.pdfnetsdksamples.samples;
7
8import com.pdftron.android.pdfnetsdksamples.OutputListener;
9import com.pdftron.android.pdfnetsdksamples.PDFNetSample;
10import com.pdftron.android.pdfnetsdksamples.R;
11import com.pdftron.android.pdfnetsdksamples.util.Utils;
12import com.pdftron.common.PDFNetException;
13import com.pdftron.pdf.Element;
14import com.pdftron.pdf.ElementReader;
15import com.pdftron.pdf.PDFDoc;
16import com.pdftron.pdf.Page;
17import com.pdftron.pdf.PageIterator;
18import com.pdftron.pdf.struct.ContentItem;
19import com.pdftron.pdf.struct.SElement;
20import com.pdftron.pdf.struct.STree;
21import com.pdftron.sdf.Obj;
22import com.pdftron.sdf.SDFDoc;
23
24import java.util.ArrayList;
25import java.util.Map;
26import java.util.TreeMap;
27
28public class LogicalStructureTest extends PDFNetSample {
29
30 private static OutputListener mOutputListener;
31
32 private static ArrayList<String> mFileList = new ArrayList<>();
33
/**
 * Creates the sample and sets its title and description from string resources.
 */
public LogicalStructureTest() {
    super();
    setTitle(R.string.sample_logicalstructure_title);
    setDescription(R.string.sample_logicalstructure_description);
}
38
39 @Override
40 public void run(OutputListener outputListener) {
41 super.run(outputListener);
42 mOutputListener = outputListener;
43 mFileList.clear();
44 printHeader(outputListener);
45
46 try (PDFDoc doc = new PDFDoc((Utils.getAssetTempFile(INPUT_PATH + "tagged.pdf").getAbsolutePath()))) // Extract logical structure from a PDF document
47 {
48 doc.initSecurityHandler();
49
50 mOutputListener.println("____________________________________________________________");
51 mOutputListener.println("Sample 1 - Traverse logical structure tree...");
52 {
53 STree tree = doc.getStructTree();
54 if (tree.isValid()) {
55 mOutputListener.println("Document has a StructTree root.");
56
57 for (int i = 0; i < tree.getNumKids(); ++i) {
58 // Recursively get structure info for all all child elements.
59 ProcessStructElement(tree.getKid(i), 0);
60 }
61 } else {
62 mOutputListener.println("This document does not contain any logical structure.");
63 }
64 }
65 mOutputListener.println("\nDone 1.");
66
67 mOutputListener.println("____________________________________________________________");
68 mOutputListener.println("Sample 2 - Get parent logical structure elements from");
69 mOutputListener.println("layout elements.");
70 {
71 ElementReader reader = new ElementReader();
72 for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
73 reader.begin(itr.next());
74 ProcessElements(reader);
75 reader.end();
76 }
77 }
78 mOutputListener.println("\nDone 2.");
79
80 mOutputListener.println("____________________________________________________________");
81 mOutputListener.println("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
82 {
83 //A map which maps page numbers(as Integers)
84 //to page Maps(which map from struct mcid(as Integers) to
85 //text Strings)
86 Map<Integer, Map<Integer, String>> mcid_doc_map = new TreeMap<Integer, Map<Integer, String>>();
87 ElementReader reader = new ElementReader();
88 for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
89 Page current = itr.next();
90 reader.begin(current);
91 Map<Integer, String> page_mcid_map = new TreeMap<Integer, String>();
92 mcid_doc_map.put(new Integer(current.getIndex()), page_mcid_map);
93 ProcessElements2(reader, page_mcid_map);
94 reader.end();
95 }
96
97 STree tree = doc.getStructTree();
98 if (tree.isValid()) {
99 for (int i = 0; i < tree.getNumKids(); ++i) {
100 ProcessStructElement2(tree.getKid(i), mcid_doc_map, 0);
101 }
102 }
103 }
104 mOutputListener.println("\nDone 3.");
105 doc.save((Utils.createExternalFile("LogicalStructure.pdf", mFileList).getAbsolutePath()), SDFDoc.SaveMode.LINEARIZED, null);
106 } catch (Exception e) {
107 mOutputListener.printError(e.getStackTrace());
108 }
109
110 for (String file : mFileList) {
111 addToFileList(file);
112 }
113 printFooter(outputListener);
114 }
115 static void PrintIndent(int indent) {
116 mOutputListener.println();
117 for (int i = 0; i < indent; ++i) mOutputListener.print(" ");
118 }
119
120 // Used in code snippet 1.
121 static void ProcessStructElement(SElement element, int indent) throws PDFNetException {
122 if (!element.isValid()) {
123 return;
124 }
125
126 // Print out the type and title info, if any.
127 PrintIndent(indent++);
128 mOutputListener.print("Type: " + element.getType());
129 if (element.hasTitle()) {
130 mOutputListener.print(". Title: " + element.getTitle());
131 }
132
133 int num = element.getNumKids();
134 for (int i = 0; i < num; ++i) {
135 // Check is the kid is a leaf node (i.e. it is a ContentItem).
136 if (element.isContentItem(i)) {
137 ContentItem cont = element.getAsContentItem(i);
138 int type = cont.getType();
139
140 Page page = cont.getPage();
141
142 PrintIndent(indent);
143 mOutputListener.print("Content Item. Part of page #" + page.getIndex());
144
145 PrintIndent(indent);
146 switch (type) {
147 case ContentItem.e_MCID:
148 case ContentItem.e_MCR:
149 mOutputListener.print("MCID: " + cont.getMCID());
150 break;
151 case ContentItem.e_OBJR: {
152 mOutputListener.print("OBJR ");
153 Obj ref_obj = cont.getRefObj();
154 if (ref_obj != null)
155 mOutputListener.print("- Referenced Object#: " + ref_obj.getObjNum());
156 }
157 break;
158 default:
159 break;
160 }
161 } else { // the kid is another StructElement node.
162 ProcessStructElement(element.getAsStructElem(i), indent);
163 }
164 }
165 }
166
167 // Used in code snippet 2.
168 static void ProcessElements(ElementReader reader) throws PDFNetException {
169 Element element;
170 while ((element = reader.next()) != null) // Read page contents
171 {
172 // In this sample we process only paths & text, but the code can be
173 // extended to handle any element type.
174 int type = element.getType();
175 if (type == Element.e_path || type == Element.e_text || type == Element.e_path) {
176 switch (type) {
177 case Element.e_path: // Process path ...
178 mOutputListener.print("\nPATH: ");
179 break;
180 case Element.e_text: // Process text ...
181 mOutputListener.print("\nTEXT: " + element.getTextString() + "\n");
182 break;
183 case Element.e_form: // Process form XObjects
184 mOutputListener.print("\nFORM XObject: ");
185 //reader.FormBegin();
186 //ProcessElements(reader);
187 //reader.End();
188 break;
189 }
190
191 // Check if the element is associated with any structural element.
192 // Content items are leaf nodes of the structure tree.
193 SElement struct_parent = element.getParentStructElement();
194 if (struct_parent.isValid()) {
195 // Print out the parent structural element's type, title, and object number.
196 mOutputListener.print(" Type: " + struct_parent.getType()
197 + ", MCID: " + element.getStructMCID());
198 if (struct_parent.hasTitle()) {
199 mOutputListener.print(". Title: " + struct_parent.getTitle());
200 }
201 mOutputListener.print(", Obj#: " + struct_parent.getSDFObj().getObjNum());
202 }
203 }
204 }
205 }
206
207 // Used in code snippet 3.
208 //typedef map<int, string> MCIDPageMap;
209 //typedef map<int, MCIDPageMap> MCIDDocMap;
210
211 // Used in code snippet 3.
212 static void ProcessElements2(ElementReader reader, Map<Integer, String> mcid_page_map) throws PDFNetException {
213 Element element;
214 while ((element = reader.next()) != null) // Read page contents
215 {
216 // In this sample we process only text, but the code can be extended
217 // to handle paths, images, or any other Element type.
218 int mcid = element.getStructMCID();
219 Integer key_mcid = new Integer(mcid);
220 if (mcid >= 0 && element.getType() == Element.e_text) {
221 String val = element.getTextString();
222 if (mcid_page_map.containsKey(key_mcid))
223 mcid_page_map.put(key_mcid, ((String) (mcid_page_map.get(key_mcid)) + val));
224 else mcid_page_map.put(key_mcid, val);
225 }
226 }
227 }
228
229 // Used in code snippet 3.
230 static void ProcessStructElement2(SElement element, Map<Integer, Map<Integer, String>> mcid_doc_map, int indent) throws PDFNetException {
231 if (!element.isValid()) {
232 return;
233 }
234
235 // Print out the type and title info, if any.
236 PrintIndent(indent);
237 mOutputListener.print("<" + element.getType());
238 if (element.hasTitle()) {
239 mOutputListener.print(" title=\"" + element.getTitle() + "\"");
240 }
241 mOutputListener.print(">");
242
243 int num = element.getNumKids();
244 for (int i = 0; i < num; ++i) {
245 if (element.isContentItem(i)) {
246 ContentItem cont = element.getAsContentItem(i);
247 if (cont.getType() == ContentItem.e_MCID) {
248 int page_num = cont.getPage().getIndex();
249 Integer page_num_key = new Integer(page_num);
250 if (mcid_doc_map.containsKey(page_num_key)) {
251 Map<Integer, String> mcid_page_map = mcid_doc_map.get(page_num_key);
252 Integer mcid_key = new Integer(cont.getMCID());
253 if (mcid_page_map.containsKey(mcid_key)) {
254 mOutputListener.print(mcid_page_map.get(mcid_key));
255 }
256 }
257 }
258 } else { // the kid is another StructElement node.
259 ProcessStructElement2(element.getAsStructElem(i), mcid_doc_map, indent + 1);
260 }
261 }
262
263 PrintIndent(indent);
264 mOutputListener.print("</" + element.getType() + ">");
265 }
266
267 /**
268 * @param args
269 */
270
271}
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales