LogicalStructure

Sample Kotlin code for using the Apryse SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, the StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; the ContentItems are the leaf nodes of the structure tree. Learn more about our Android SDK and PDF Data Extraction SDK Capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5package com.pdftron.android.pdfnetsdksamples.samples
6
7import com.pdftron.android.pdfnetsdksamples.OutputListener
8import com.pdftron.android.pdfnetsdksamples.PDFNetSample
9import com.pdftron.android.pdfnetsdksamples.R
10import com.pdftron.android.pdfnetsdksamples.util.Utils
11import com.pdftron.common.PDFNetException
12import com.pdftron.pdf.PDFDoc
13import com.pdftron.pdf.PageIterator
14import com.pdftron.pdf.struct.ContentItem
15import com.pdftron.pdf.struct.SElement
16import com.pdftron.pdf.struct.STree
17import com.pdftron.sdf.SDFDoc
18
/**
 * Sample that explores the logical structure of the tagged PDF asset "tagged.pdf"
 * and writes what it finds to the supplied [OutputListener].
 *
 * Three passes are made over the document:
 *  1. a recursive walk of the structure tree,
 *  2. a walk of each page's content that looks up the parent structure element, and
 *  3. an "XML style" dump that pairs structure elements with the page text they mark.
 */
class LogicalStructureTest : PDFNetSample() {

    init {
        setTitle(R.string.sample_logicalstructure_title)
        setDescription(R.string.sample_logicalstructure_description)
    }

    /**
     * Runs the three sample passes, saves the document as "LogicalStructure.pdf"
     * and registers every produced file via `addToFileList`.
     *
     * Any exception raised while processing the document is reported through
     * [OutputListener.printError] instead of being propagated.
     *
     * @param outputListener sink for all sample output; must not be null.
     */
    override fun run(outputListener: OutputListener?) {
        super.run(outputListener)
        mOutputListener = outputListener
        mFileList.clear()
        // `!!` keeps the original fail-fast contract: a null listener is a caller error.
        printHeader(outputListener!!)
        try {
            val inputFile = Utils.getAssetTempFile("${INPUT_PATH}tagged.pdf")!!
            PDFDoc(inputFile.getAbsolutePath()).use { doc ->
                doc.initSecurityHandler()

                out.println(SEPARATOR)
                out.println("Sample 1 - Traverse logical structure tree...")
                traverseStructTree(doc)
                out.println("\nDone 1.")

                out.println(SEPARATOR)
                out.println("Sample 2 - Get parent logical structure elements from")
                out.println("layout elements.")
                printParentStructElements(doc)
                out.println("\nDone 2.")

                out.println(SEPARATOR)
                out.println("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
                printXmlStyleStructure(doc)
                out.println("\nDone 3.")

                val outputFile = Utils.createExternalFile("LogicalStructure.pdf", mFileList)
                doc.save(outputFile.getAbsolutePath(), SDFDoc.SaveMode.LINEARIZED, null)
            }
        } catch (e: Exception) {
            out.printError(e.stackTrace)
        }
        for (file in mFileList) {
            addToFileList(file)
        }
        printFooter(outputListener)
    }

    /** Sample 1: recursively prints every element reachable from the structure tree root. */
    private fun traverseStructTree(doc: PDFDoc) {
        val tree: STree = doc.getStructTree()
        if (!tree.isValid()) {
            out.println("This document does not contain any logical structure.")
            return
        }
        out.println("Document has a StructTree root.")
        for (i in 0 until tree.getNumKids()) {
            // Recursively get structure info for all child elements.
            ProcessStructElement(tree.getKid(i), 0)
        }
    }

    /** Sample 2: for each page, prints the layout elements and their parent structure element. */
    private fun printParentStructElements(doc: PDFDoc) {
        val reader = com.pdftron.pdf.ElementReader()
        val pages: PageIterator = doc.getPageIterator()
        while (pages.hasNext()) {
            reader.begin(pages.next())
            ProcessElements(reader)
            reader.end()
        }
    }

    /**
     * Sample 3: collects the text of every marked-content sequence per page, then prints
     * the structure tree as XML-like tags with that text inlined.
     */
    private fun printXmlStyleStructure(doc: PDFDoc) {
        // Maps page number -> (struct MCID -> text marked with that MCID on the page).
        val mcidDocMap: MutableMap<Int, Map<Int, String>> = java.util.TreeMap()
        val reader = com.pdftron.pdf.ElementReader()
        val pages: PageIterator = doc.getPageIterator()
        while (pages.hasNext()) {
            val page: com.pdftron.pdf.Page = pages.next()
            reader.begin(page)
            val pageMcidMap: MutableMap<Int, String> = java.util.TreeMap()
            mcidDocMap[page.getIndex()] = pageMcidMap
            ProcessElements2(reader, pageMcidMap)
            reader.end()
        }

        val tree: STree = doc.getStructTree()
        if (tree.isValid()) {
            for (i in 0 until tree.getNumKids()) {
                ProcessStructElement2(tree.getKid(i), mcidDocMap, 0)
            }
        }
    }

    companion object {
        private const val SEPARATOR = "____________________________________________________________"

        /** Listener installed by the most recent [run] call; shared by the static helpers below. */
        private var mOutputListener: OutputListener? = null
        private val mFileList: java.util.ArrayList<String> = java.util.ArrayList<String>()

        /**
         * Non-null view of [mOutputListener]. The single `!!` here replaces one at every
         * call site; it throws if a helper is used before [run] has installed a listener.
         */
        private val out: OutputListener
            get() = mOutputListener!!

        /** Starts a new output line and indents it by [indent] levels of two spaces. */
        fun PrintIndent(indent: Int) {
            out.println()
            repeat(indent) { out.print("  ") }
        }

        /**
         * Used in code snippet 1. Prints the type and optional title of [element] at the
         * given [indent], then its kids one level deeper: content items are described
         * inline, nested structure elements are handled recursively.
         * Invalid elements are skipped silently.
         */
        @Throws(PDFNetException::class)
        fun ProcessStructElement(element: SElement, indent: Int) {
            if (!element.isValid()) {
                return
            }

            // Print out the type and title info, if any.
            PrintIndent(indent)
            out.print("Type: ${element.getType()}")
            if (element.hasTitle()) {
                out.print(". Title: ${element.getTitle()}")
            }

            val kidIndent = indent + 1
            for (i in 0 until element.getNumKids()) {
                // Check if the kid is a leaf node (i.e. it is a ContentItem).
                if (element.isContentItem(i)) {
                    val cont: ContentItem = element.getAsContentItem(i)
                    val page: com.pdftron.pdf.Page = cont.getPage()
                    PrintIndent(kidIndent)
                    out.print("Content Item. Part of page #${page.getIndex()}")
                    PrintIndent(kidIndent)
                    when (cont.getType()) {
                        ContentItem.e_MCID, ContentItem.e_MCR -> out.print("MCID: ${cont.getMCID()}")
                        ContentItem.e_OBJR -> {
                            out.print("OBJR ")
                            // Declared nullable so the null check below is effective; a non-null
                            // declaration would throw on a null return before the check ran.
                            val refObj: com.pdftron.sdf.Obj? = cont.getRefObj()
                            if (refObj != null) {
                                out.print("- Referenced Object#: ${refObj.getObjNum()}")
                            }
                        }
                        else -> Unit
                    }
                } else {
                    // The kid is another StructElement node.
                    ProcessStructElement(element.getAsStructElem(i), kidIndent)
                }
            }
        }

        /**
         * Used in code snippet 2. Reads every element from [reader] and, for paths, text
         * and form XObjects, prints a header followed by the parent structure element's
         * type, MCID, optional title and object number when such a parent exists.
         */
        @Throws(PDFNetException::class)
        fun ProcessElements(reader: com.pdftron.pdf.ElementReader) {
            // Read page contents.
            while (true) {
                val element: com.pdftron.pdf.Element = reader.next() ?: break

                // In this sample we process only paths, text and form XObjects, but the
                // code can be extended to handle any element type. (The original filter
                // tested e_path twice, which made the form XObject branch unreachable.)
                val header: String = when (element.getType()) {
                    com.pdftron.pdf.Element.e_path -> "\nPATH: "
                    // Plain template instead of a raw string + trimIndent(), so the
                    // element's own text can never influence how much indent is stripped.
                    com.pdftron.pdf.Element.e_text -> "\nTEXT: ${element.getTextString()}\n"
                    com.pdftron.pdf.Element.e_form -> "\nFORM XObject: "
                    else -> null
                } ?: continue
                out.print(header)

                // Check if the element is associated with any structural element.
                // Content items are leaf nodes of the structure tree.
                val structParent: SElement = element.getParentStructElement()
                if (structParent.isValid()) {
                    // Print out the parent structural element's type, title, and object number.
                    out.print(" Type: ${structParent.getType()}, MCID: ${element.getStructMCID()}")
                    if (structParent.hasTitle()) {
                        out.print(". Title: ${structParent.getTitle()}")
                    }
                    out.print(", Obj#: ${structParent.getSDFObj().getObjNum()}")
                }
            }
        }

        /**
         * Used in code snippet 3. Reads every element from [reader] and appends the text of
         * each text element that carries a non-negative struct MCID to the entry for that
         * MCID in [mcid_page_map].
         */
        @Throws(PDFNetException::class)
        fun ProcessElements2(reader: com.pdftron.pdf.ElementReader, mcid_page_map: MutableMap<Int, String>) {
            // Read page contents.
            while (true) {
                val element: com.pdftron.pdf.Element = reader.next() ?: break

                // In this sample we process only text, but the code can be extended
                // to handle paths, images, or any other Element type.
                val mcid: Int = element.getStructMCID()
                if (mcid >= 0 && element.getType() == com.pdftron.pdf.Element.e_text) {
                    val text: String = element.getTextString()
                    mcid_page_map[mcid] = (mcid_page_map[mcid] ?: "") + text
                }
            }
        }

        /**
         * Used in code snippet 3. Prints [element] as an XML-like open/close tag pair at the
         * given [indent]. MCID content items are replaced by the text recorded for their
         * page and MCID in [mcid_doc_map] (if any); nested structure elements are printed
         * recursively one level deeper. Invalid elements are skipped silently.
         */
        @Throws(PDFNetException::class)
        fun ProcessStructElement2(element: SElement, mcid_doc_map: Map<Int, Map<Int, String>>, indent: Int) {
            if (!element.isValid()) {
                return
            }

            // Print out the type and title info, if any.
            PrintIndent(indent)
            out.print("<${element.getType()}")
            if (element.hasTitle()) {
                out.print(" title=\"${element.getTitle()}\"")
            }
            out.print(">")

            for (i in 0 until element.getNumKids()) {
                if (element.isContentItem(i)) {
                    val cont: ContentItem = element.getAsContentItem(i)
                    if (cont.getType() == ContentItem.e_MCID) {
                        // The MCID is only read when the page has a recorded map.
                        val text: String? = mcid_doc_map[cont.getPage().getIndex()]?.get(cont.getMCID())
                        if (text != null) {
                            out.print(text)
                        }
                    }
                } else {
                    // The kid is another StructElement node.
                    ProcessStructElement2(element.getAsStructElem(i), mcid_doc_map, indent + 1)
                }
            }

            PrintIndent(indent)
            out.print("</${element.getType()}>")
        }
    }
}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package com.pdftron.android.pdfnetsdksamples.samples;
7
8import com.pdftron.android.pdfnetsdksamples.OutputListener;
9import com.pdftron.android.pdfnetsdksamples.PDFNetSample;
10import com.pdftron.android.pdfnetsdksamples.R;
11import com.pdftron.android.pdfnetsdksamples.util.Utils;
12import com.pdftron.common.PDFNetException;
13import com.pdftron.pdf.Element;
14import com.pdftron.pdf.ElementReader;
15import com.pdftron.pdf.PDFDoc;
16import com.pdftron.pdf.Page;
17import com.pdftron.pdf.PageIterator;
18import com.pdftron.pdf.struct.ContentItem;
19import com.pdftron.pdf.struct.SElement;
20import com.pdftron.pdf.struct.STree;
21import com.pdftron.sdf.Obj;
22import com.pdftron.sdf.SDFDoc;
23
24import java.util.ArrayList;
25import java.util.Map;
26import java.util.TreeMap;
27
28public class LogicalStructureTest extends PDFNetSample {
29
30	private static OutputListener mOutputListener;
31
32	private static ArrayList<String> mFileList = new ArrayList<>();
33
34    public LogicalStructureTest() {
35        setTitle(R.string.sample_logicalstructure_title);
36        setDescription(R.string.sample_logicalstructure_description);
37    }
38
39	@Override
40	public void run(OutputListener outputListener) {
41		super.run(outputListener);
42		mOutputListener = outputListener;
43		mFileList.clear();
44		printHeader(outputListener);
45
46        try (PDFDoc doc = new PDFDoc((Utils.getAssetTempFile(INPUT_PATH + "tagged.pdf").getAbsolutePath())))    // Extract logical structure from a PDF document
47        {
48            doc.initSecurityHandler();
49
50            mOutputListener.println("____________________________________________________________");
51            mOutputListener.println("Sample 1 - Traverse logical structure tree...");
52            {
53                STree tree = doc.getStructTree();
54                if (tree.isValid()) {
55                    mOutputListener.println("Document has a StructTree root.");
56
57                    for (int i = 0; i < tree.getNumKids(); ++i) {
58                        // Recursively get structure  info for all all child elements.
59                        ProcessStructElement(tree.getKid(i), 0);
60                    }
61                } else {
62                    mOutputListener.println("This document does not contain any logical structure.");
63                }
64            }
65            mOutputListener.println("\nDone 1.");
66
67            mOutputListener.println("____________________________________________________________");
68            mOutputListener.println("Sample 2 - Get parent logical structure elements from");
69            mOutputListener.println("layout elements.");
70            {
71                ElementReader reader = new ElementReader();
72                for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
73                    reader.begin(itr.next());
74                    ProcessElements(reader);
75                    reader.end();
76                }
77            }
78            mOutputListener.println("\nDone 2.");
79
80            mOutputListener.println("____________________________________________________________");
81            mOutputListener.println("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
82            {
83                //A map which maps page numbers(as Integers)
84                //to page Maps(which map from struct mcid(as Integers) to
85                //text Strings)
86                Map<Integer, Map<Integer, String>> mcid_doc_map = new TreeMap<Integer, Map<Integer, String>>();
87                ElementReader reader = new ElementReader();
88                for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
89                    Page current = itr.next();
90                    reader.begin(current);
91                    Map<Integer, String> page_mcid_map = new TreeMap<Integer, String>();
92                    mcid_doc_map.put(new Integer(current.getIndex()), page_mcid_map);
93                    ProcessElements2(reader, page_mcid_map);
94                    reader.end();
95                }
96
97                STree tree = doc.getStructTree();
98                if (tree.isValid()) {
99                    for (int i = 0; i < tree.getNumKids(); ++i) {
100                        ProcessStructElement2(tree.getKid(i), mcid_doc_map, 0);
101                    }
102                }
103            }
104            mOutputListener.println("\nDone 3.");
105            doc.save((Utils.createExternalFile("LogicalStructure.pdf", mFileList).getAbsolutePath()), SDFDoc.SaveMode.LINEARIZED, null);
106        } catch (Exception e) {
107            mOutputListener.printError(e.getStackTrace());
108        }
109
110		for (String file : mFileList) {
111			addToFileList(file);
112		}
113		printFooter(outputListener);
114	}
115    static void PrintIndent(int indent) {
116        mOutputListener.println();
117        for (int i = 0; i < indent; ++i) mOutputListener.print("  ");
118    }
119
120    // Used in code snippet 1.
121    static void ProcessStructElement(SElement element, int indent) throws PDFNetException {
122        if (!element.isValid()) {
123            return;
124        }
125
126        // Print out the type and title info, if any.
127        PrintIndent(indent++);
128        mOutputListener.print("Type: " + element.getType());
129        if (element.hasTitle()) {
130            mOutputListener.print(". Title: " + element.getTitle());
131        }
132
133        int num = element.getNumKids();
134        for (int i = 0; i < num; ++i) {
135            // Check is the kid is a leaf node (i.e. it is a ContentItem).
136            if (element.isContentItem(i)) {
137                ContentItem cont = element.getAsContentItem(i);
138                int type = cont.getType();
139
140                Page page = cont.getPage();
141
142                PrintIndent(indent);
143                mOutputListener.print("Content Item. Part of page #" + page.getIndex());
144
145                PrintIndent(indent);
146                switch (type) {
147                    case ContentItem.e_MCID:
148                    case ContentItem.e_MCR:
149                        mOutputListener.print("MCID: " + cont.getMCID());
150                        break;
151                    case ContentItem.e_OBJR: {
152                        mOutputListener.print("OBJR ");
153                        Obj ref_obj = cont.getRefObj();
154                        if (ref_obj != null)
155                            mOutputListener.print("- Referenced Object#: " + ref_obj.getObjNum());
156                    }
157                    break;
158                    default:
159                        break;
160                }
161            } else {  // the kid is another StructElement node.
162                ProcessStructElement(element.getAsStructElem(i), indent);
163            }
164        }
165    }
166
167    // Used in code snippet 2.
168    static void ProcessElements(ElementReader reader) throws PDFNetException {
169        Element element;
170        while ((element = reader.next()) != null)    // Read page contents
171        {
172            // In this sample we process only paths & text, but the code can be
173            // extended to handle any element type.
174            int type = element.getType();
175            if (type == Element.e_path || type == Element.e_text || type == Element.e_path) {
176                switch (type) {
177                    case Element.e_path:                // Process path ...
178                        mOutputListener.print("\nPATH: ");
179                        break;
180                    case Element.e_text:                // Process text ...
181                        mOutputListener.print("\nTEXT: " + element.getTextString() + "\n");
182                        break;
183                    case Element.e_form:                // Process form XObjects
184                        mOutputListener.print("\nFORM XObject: ");
185                        //reader.FormBegin();
186                        //ProcessElements(reader);
187                        //reader.End();
188                        break;
189                }
190
191                // Check if the element is associated with any structural element.
192                // Content items are leaf nodes of the structure tree.
193                SElement struct_parent = element.getParentStructElement();
194                if (struct_parent.isValid()) {
195                    // Print out the parent structural element's type, title, and object number.
196                    mOutputListener.print(" Type: " + struct_parent.getType()
197                            + ", MCID: " + element.getStructMCID());
198                    if (struct_parent.hasTitle()) {
199                        mOutputListener.print(". Title: " + struct_parent.getTitle());
200                    }
201                    mOutputListener.print(", Obj#: " + struct_parent.getSDFObj().getObjNum());
202                }
203            }
204        }
205    }
206
207    // Used in code snippet 3.
208    //typedef map<int, string> MCIDPageMap;
209    //typedef map<int, MCIDPageMap> MCIDDocMap;
210
211    // Used in code snippet 3.
212    static void ProcessElements2(ElementReader reader, Map<Integer, String> mcid_page_map) throws PDFNetException {
213        Element element;
214        while ((element = reader.next()) != null) // Read page contents
215        {
216            // In this sample we process only text, but the code can be extended
217            // to handle paths, images, or any other Element type.
218            int mcid = element.getStructMCID();
219            Integer key_mcid = new Integer(mcid);
220            if (mcid >= 0 && element.getType() == Element.e_text) {
221                String val = element.getTextString();
222                if (mcid_page_map.containsKey(key_mcid))
223                    mcid_page_map.put(key_mcid, ((String) (mcid_page_map.get(key_mcid)) + val));
224                else mcid_page_map.put(key_mcid, val);
225            }
226        }
227    }
228
229    // Used in code snippet 3.
230    static void ProcessStructElement2(SElement element, Map<Integer, Map<Integer, String>> mcid_doc_map, int indent) throws PDFNetException {
231        if (!element.isValid()) {
232            return;
233        }
234
235        // Print out the type and title info, if any.
236        PrintIndent(indent);
237        mOutputListener.print("<" + element.getType());
238        if (element.hasTitle()) {
239            mOutputListener.print(" title=\"" + element.getTitle() + "\"");
240        }
241        mOutputListener.print(">");
242
243        int num = element.getNumKids();
244        for (int i = 0; i < num; ++i) {
245            if (element.isContentItem(i)) {
246                ContentItem cont = element.getAsContentItem(i);
247                if (cont.getType() == ContentItem.e_MCID) {
248                    int page_num = cont.getPage().getIndex();
249                    Integer page_num_key = new Integer(page_num);
250                    if (mcid_doc_map.containsKey(page_num_key)) {
251                        Map<Integer, String> mcid_page_map = mcid_doc_map.get(page_num_key);
252                        Integer mcid_key = new Integer(cont.getMCID());
253                        if (mcid_page_map.containsKey(mcid_key)) {
254                            mOutputListener.print(mcid_page_map.get(mcid_key));
255                        }
256                    }
257                }
258            } else {  // the kid is another StructElement node.
259                ProcessStructElement2(element.getAsStructElem(i), mcid_doc_map, indent + 1);
260            }
261        }
262
263        PrintIndent(indent);
264        mOutputListener.print("</" + element.getType() + ">");
265    }
266
267    /**
268     * @param args
269     */
270
271}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales

Product:

LogicalStructure