PDF Logical Structure Reader - Java Sample Code

Sample code for using Apryse Server SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s; the ContentItem-s are the leaf nodes of the structure tree. Sample code is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
5
6import java.util.Map;
7import java.util.TreeMap;
8
9import com.pdftron.common.PDFNetException;
10import com.pdftron.pdf.struct.*;
11import com.pdftron.pdf.*;
12import com.pdftron.sdf.*;
13
//---------------------------------------------------------------------------------------
// This sample explores the structure and content of a tagged PDF document and dumps
// the structure information to the console window.
//
// In tagged PDF documents StructTree acts as a central repository for information
// related to a PDF document's logical structure. The tree consists of StructElement-s
// and ContentItem-s; the ContentItem-s are the leaf nodes of the structure tree.
//
// The sample can be extended to access and extract the marked-content elements such
// as text and images.
//---------------------------------------------------------------------------------------
25public class LogicalStructureTest {
26 static void PrintIndent(int indent) {
27 System.out.println();
28 for (int i = 0; i < indent; ++i) System.out.print(" ");
29 }
30
31 // Used in code snippet 1.
32 static void ProcessStructElement(SElement element, int indent) throws PDFNetException {
33 if (!element.isValid()) {
34 return;
35 }
36
37 // Print out the type and title info, if any.
38 PrintIndent(indent++);
39 System.out.print("Type: " + element.getType());
40 if (element.hasTitle()) {
41 System.out.print(". Title: " + element.getTitle());
42 }
43
44 int num = element.getNumKids();
45 for (int i = 0; i < num; ++i) {
46 // Check is the kid is a leaf node (i.e. it is a ContentItem).
47 if (element.isContentItem(i)) {
48 ContentItem cont = element.getAsContentItem(i);
49 int type = cont.getType();
50
51 Page page = cont.getPage();
52
53 PrintIndent(indent);
54 System.out.print("Content Item. Part of page #" + page.getIndex());
55
56 PrintIndent(indent);
57 switch (type) {
58 case ContentItem.e_MCID:
59 case ContentItem.e_MCR:
60 System.out.print("MCID: " + cont.getMCID());
61 break;
62 case ContentItem.e_OBJR: {
63 System.out.print("OBJR ");
64 Obj ref_obj = cont.getRefObj();
65 if (ref_obj != null)
66 System.out.print("- Referenced Object#: " + ref_obj.getObjNum());
67 }
68 break;
69 default:
70 break;
71 }
72 } else { // the kid is another StructElement node.
73 ProcessStructElement(element.getAsStructElem(i), indent);
74 }
75 }
76 }
77
78 // Used in code snippet 2.
79 static void ProcessElements(ElementReader reader) throws PDFNetException {
80 Element element;
81 while ((element = reader.next()) != null) // Read page contents
82 {
83 // In this sample we process only paths & text, but the code can be
84 // extended to handle any element type.
85 int type = element.getType();
86 if (type == Element.e_path || type == Element.e_text || type == Element.e_path) {
87 switch (type) {
88 case Element.e_path: // Process path ...
89 System.out.print("\nPATH: ");
90 break;
91 case Element.e_text: // Process text ...
92 System.out.print("\nTEXT: " + element.getTextString() + "\n");
93 break;
94 case Element.e_form: // Process form XObjects
95 System.out.print("\nFORM XObject: ");
96 //reader.FormBegin();
97 //ProcessElements(reader);
98 //reader.End();
99 break;
100 }
101
102 // Check if the element is associated with any structural element.
103 // Content items are leaf nodes of the structure tree.
104 SElement struct_parent = element.getParentStructElement();
105 if (struct_parent.isValid()) {
106 // Print out the parent structural element's type, title, and object number.
107 System.out.print(" Type: " + struct_parent.getType()
108 + ", MCID: " + element.getStructMCID());
109 if (struct_parent.hasTitle()) {
110 System.out.print(". Title: " + struct_parent.getTitle());
111 }
112 System.out.print(", Obj#: " + struct_parent.getSDFObj().getObjNum());
113 }
114 }
115 }
116 }
117
118 // Used in code snippet 3.
119 //typedef map<int, string> MCIDPageMap;
120 //typedef map<int, MCIDPageMap> MCIDDocMap;
121
122 // Used in code snippet 3.
123 static void ProcessElements2(ElementReader reader, Map<Integer, String> mcid_page_map) throws PDFNetException {
124 Element element;
125 while ((element = reader.next()) != null) // Read page contents
126 {
127 // In this sample we process only text, but the code can be extended
128 // to handle paths, images, or any other Element type.
129 int mcid = element.getStructMCID();
130 Integer key_mcid = new Integer(mcid);
131 if (mcid >= 0 && element.getType() == Element.e_text) {
132 String val = element.getTextString();
133 if (mcid_page_map.containsKey(key_mcid))
134 mcid_page_map.put(key_mcid, ((String) (mcid_page_map.get(key_mcid)) + val));
135 else mcid_page_map.put(key_mcid, val);
136 }
137 }
138 }
139
140 // Used in code snippet 3.
141 static void ProcessStructElement2(SElement element, Map<Integer, Map<Integer, String>> mcid_doc_map, int indent) throws PDFNetException {
142 if (!element.isValid()) {
143 return;
144 }
145
146 // Print out the type and title info, if any.
147 PrintIndent(indent);
148 System.out.print("<" + element.getType());
149 if (element.hasTitle()) {
150 System.out.print(" title=\"" + element.getTitle() + "\"");
151 }
152 System.out.print(">");
153
154 int num = element.getNumKids();
155 for (int i = 0; i < num; ++i) {
156 if (element.isContentItem(i)) {
157 ContentItem cont = element.getAsContentItem(i);
158 if (cont.getType() == ContentItem.e_MCID) {
159 int page_num = cont.getPage().getIndex();
160 Integer page_num_key = new Integer(page_num);
161 if (mcid_doc_map.containsKey(page_num_key)) {
162 Map<Integer, String> mcid_page_map = mcid_doc_map.get(page_num_key);
163 Integer mcid_key = new Integer(cont.getMCID());
164 if (mcid_page_map.containsKey(mcid_key)) {
165 System.out.print(mcid_page_map.get(mcid_key));
166 }
167 }
168 }
169 } else { // the kid is another StructElement node.
170 ProcessStructElement2(element.getAsStructElem(i), mcid_doc_map, indent + 1);
171 }
172 }
173
174 PrintIndent(indent);
175 System.out.print("</" + element.getType() + ">");
176 }
177
178
179 /**
180 * @param args
181 */
182 public static void main(String[] args) {
183 PDFNet.initialize(PDFTronLicense.Key());
184
185 // Relative path to the folder containing test files.
186 String input_path = "../../TestFiles/";
187 String output_path = "../../TestFiles/Output/";
188
189 try (PDFDoc doc = new PDFDoc((input_path + "tagged.pdf"))) // Extract logical structure from a PDF document
190 {
191 doc.initSecurityHandler();
192
193 System.out.println("____________________________________________________________");
194 System.out.println("Sample 1 - Traverse logical structure tree...");
195 {
196 STree tree = doc.getStructTree();
197 if (tree.isValid()) {
198 System.out.println("Document has a StructTree root.");
199
200 for (int i = 0; i < tree.getNumKids(); ++i) {
201 // Recursively get structure info for all all child elements.
202 ProcessStructElement(tree.getKid(i), 0);
203 }
204 } else {
205 System.out.println("This document does not contain any logical structure.");
206 }
207 }
208 System.out.println("\nDone 1.");
209
210 System.out.println("____________________________________________________________");
211 System.out.println("Sample 2 - Get parent logical structure elements from");
212 System.out.println("layout elements.");
213 {
214 ElementReader reader = new ElementReader();
215 for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
216 reader.begin(itr.next());
217 ProcessElements(reader);
218 reader.end();
219 }
220 }
221 System.out.println("\nDone 2.");
222
223 System.out.println("____________________________________________________________");
224 System.out.println("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
225 {
226 //A map which maps page numbers(as Integers)
227 //to page Maps(which map from struct mcid(as Integers) to
228 //text Strings)
229 Map<Integer, Map<Integer, String>> mcid_doc_map = new TreeMap<Integer, Map<Integer, String>>();
230 ElementReader reader = new ElementReader();
231 for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
232 Page current = itr.next();
233 reader.begin(current);
234 Map<Integer, String> page_mcid_map = new TreeMap<Integer, String>();
235 mcid_doc_map.put(new Integer(current.getIndex()), page_mcid_map);
236 ProcessElements2(reader, page_mcid_map);
237 reader.end();
238 }
239
240 STree tree = doc.getStructTree();
241 if (tree.isValid()) {
242 for (int i = 0; i < tree.getNumKids(); ++i) {
243 ProcessStructElement2(tree.getKid(i), mcid_doc_map, 0);
244 }
245 }
246 }
247 System.out.println("\nDone 3.");
248 doc.save((output_path + "LogicalStructure.pdf"), SDFDoc.SaveMode.LINEARIZED, null);
249 } catch (Exception e) {
250 e.printStackTrace();
251 }
252
253 PDFNet.terminate();
254 }
255
256}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales