PDF Logical Structure Reader - C# Sample Code

Sample code that uses the Apryse Server SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; ContentItems are the leaf nodes of the structure tree. Sample code is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6using System;
7using System.Collections;
8
9using pdftron;
10using pdftron.Common;
11using pdftron.Filters;
12using pdftron.SDF;
13using pdftron.PDF;
14using pdftron.PDF.Struct;
15
16
17namespace LogicalStructureTestCS
18{
19 //---------------------------------------------------------------------------------------
20 // This sample explores the structure and content of a tagged PDF document and dumps
21 // the structure information to the console window.
22 //
23 // In tagged PDF documents StructTree acts as a central repository for information
24 // related to a PDF document's logical structure. The tree consists of StructElement-s
25 // and ContentItem-s which are leaf nodes of the structure tree.
26 //
27 // The sample can be extended to access and extract the marked-content elements such
28 // as text and images.
29 //---------------------------------------------------------------------------------------
30 class Class1
31 {
/// <summary>
/// Starts a new console line and then writes one space per indentation level.
/// </summary>
/// <param name="indent">Number of spaces to write after the line break; zero or less writes none.</param>
static void PrintIndent(int indent)
{
    Console.WriteLine();

    int remaining = indent;
    while (remaining > 0)
    {
        Console.Write(" ");
        --remaining;
    }
}
33
34 // Used in code snippet 1.
/// <summary>
/// Prints the type (and title, if present) of a structure element, then walks its kids:
/// content items are described in place and nested structure elements are processed
/// recursively one indentation level deeper.
/// </summary>
/// <param name="element">Structure element to report; invalid elements are skipped silently.</param>
/// <param name="indent">Indentation level used for this element's own line.</param>
static void ProcessStructElement(SElement element, int indent)
{
    if (!element.IsValid())
    {
        return;
    }

    // The element's own line goes at the current level; everything it owns goes one deeper.
    PrintIndent(indent);
    int kidIndent = indent + 1;

    Console.Write("Type: " + element.GetType());
    if (element.HasTitle())
    {
        Console.Write(". Title: " + element.GetTitle());
    }

    int kidCount = element.GetNumKids();
    for (int k = 0; k < kidCount; ++k)
    {
        // A kid that is not a content item is another StructElement node.
        if (!element.IsContentItem(k))
        {
            ProcessStructElement(element.GetAsStructElem(k), kidIndent);
            continue;
        }

        // The kid is a leaf node (i.e. it is a ContentItem).
        ContentItem item = element.GetAsContentItem(k);
        ContentItem.Type itemType = item.GetType();
        Page owningPage = item.GetPage();

        PrintIndent(kidIndent);
        Console.Write("Content Item. Part of page #" + owningPage.GetIndex());

        PrintIndent(kidIndent);
        if (itemType == ContentItem.Type.e_MCID || itemType == ContentItem.Type.e_MCR)
        {
            Console.Write("MCID: " + item.GetMCID());
        }
        else if (itemType == ContentItem.Type.e_OBJR)
        {
            Console.Write("OBJR ");
            Obj referenced = item.GetRefObj();
            if (referenced != null)
            {
                Console.Write("- Referenced Object#: " + referenced.GetObjNum());
            }
        }
        // Any other content item type prints nothing after the indent.
    }
}
84
85 // Used in code snippet 2.
/// <summary>
/// Reads every element from the reader and, for paths, text runs and form XObjects,
/// prints a short description followed by information about the structure element
/// (if any) that the element belongs to.
/// </summary>
/// <remarks>
/// The original filter tested <c>e_path</c> twice, which made the form XObject case
/// unreachable. The element type is now dispatched directly by the switch so that all
/// three handled types are reported; every other type is skipped.
/// </remarks>
/// <param name="reader">Reader that has already been started on a page.</param>
static void ProcessElements(ElementReader reader)
{
    Element element;
    while ((element = reader.Next()) != null) // Read page contents
    {
        // In this sample we process only paths, text and form XObjects, but the code
        // can be extended to handle any element type.
        switch (element.GetType())
        {
            case Element.Type.e_path: // Process path ...
                Console.WriteLine();
                Console.Write("PATH: ");
                break;
            case Element.Type.e_text: // Process text ...
                Console.WriteLine();
                Console.WriteLine("TEXT: " + element.GetTextString());
                break;
            case Element.Type.e_form: // Process form XObjects
                Console.WriteLine();
                Console.Write("FORM XObject: ");
                // Left disabled as in the original sample; enabling these lines would
                // descend into the form's own content.
                //reader.FormBegin();
                //ProcessElements(reader);
                //reader.End();
                break;
            default:
                // Not one of the reported element types: move on to the next element.
                continue;
        }

        // Check if the element is associated with any structural element.
        // Content items are leaf nodes of the structure tree.
        SElement struct_parent = element.GetParentStructElement();
        if (struct_parent.IsValid())
        {
            // Print out the parent structural element's type, MCID, title, and object number.
            Console.Write(" Type: " + struct_parent.GetType()
                + ", MCID: " + element.GetStructMCID());
            if (struct_parent.HasTitle())
            {
                Console.Write(". Title: " + struct_parent.GetTitle());
            }
            Console.Write(", Obj#: " + struct_parent.GetSDFObj().GetObjNum());
        }
    }
}
129
130 // Used in code snippet 3.
131 //typedef map<int, string> MCIDPageMap;
132 //typedef map<int, MCIDPageMap> MCIDDocMap;
133
134 // Used in code snippet 3.
/// <summary>
/// Reads every element from the reader and accumulates the text of each text element
/// that carries a non-negative struct MCID into <paramref name="mcid_page_map"/>,
/// concatenating the text of elements that share an MCID in reading order.
/// </summary>
/// <param name="reader">Reader that has already been started on a page.</param>
/// <param name="mcid_page_map">Map from MCID (int) to accumulated text (String); updated in place.</param>
static void ProcessElements2(ElementReader reader, Hashtable mcid_page_map)
{
    for (Element current = reader.Next(); current != null; current = reader.Next())
    {
        // In this sample we process only text, but the code can be extended
        // to handle paths, images, or any other Element type.
        int mcid = current.GetStructMCID();
        if (mcid < 0 || current.GetType() != Element.Type.e_text)
        {
            continue;
        }

        String text = current.GetTextString();
        if (!mcid_page_map.ContainsKey(mcid))
        {
            mcid_page_map.Add(mcid, text);
        }
        else
        {
            String soFar = (String)mcid_page_map[mcid];
            mcid_page_map[mcid] = soFar + text;
        }
    }
}
150
151 // Used in code snippet 3.
/// <summary>
/// Prints a structure element as an XML-style opening tag (with an optional title
/// attribute), the text collected for its MCID content items, its nested structure
/// elements one indentation level deeper, and finally the matching closing tag.
/// </summary>
/// <param name="element">Structure element to print; invalid elements are skipped silently.</param>
/// <param name="mcid_doc_map">Map from page number (int) to a Hashtable that maps MCID (int) to text.</param>
/// <param name="indent">Indentation level used for the opening and closing tags.</param>
static void ProcessStructElement2(SElement element, Hashtable mcid_doc_map, int indent)
{
    if (!element.IsValid())
    {
        return;
    }

    // Opening tag with the type and, if any, the title.
    PrintIndent(indent);
    Console.Write("<" + element.GetType());
    if (element.HasTitle())
    {
        Console.Write(" title=\"" + element.GetTitle() + "\"");
    }
    Console.Write(">");

    int kidCount = element.GetNumKids();
    for (int k = 0; k < kidCount; ++k)
    {
        // A kid that is not a content item is another StructElement node.
        if (!element.IsContentItem(k))
        {
            ProcessStructElement2(element.GetAsStructElem(k), mcid_doc_map, indent + 1);
            continue;
        }

        // Only MCID content items have text collected for them.
        ContentItem item = element.GetAsContentItem(k);
        if (item.GetType() != ContentItem.Type.e_MCID)
        {
            continue;
        }

        int pageNumber = item.GetPage().GetIndex();
        if (!mcid_doc_map.ContainsKey(pageNumber))
        {
            continue;
        }

        Hashtable pageText = (Hashtable)mcid_doc_map[pageNumber];
        int mcid = item.GetMCID();
        if (pageText.ContainsKey(mcid))
        {
            Console.Write(pageText[mcid]);
        }
    }

    // Closing tag.
    PrintIndent(indent);
    Console.Write("</" + element.GetType() + ">");
}
190
// Loader instance obtained once, as part of this class's static initialization.
private static pdftron.PDFNetLoader pdfNetLoader
    = pdftron.PDFNetLoader.Instance();

// NOTE(review): this empty static constructor looks deliberate. In C#, declaring one
// ties the static field initializer above to the first use of the class, presumably so
// the loader is in place before Main touches PDFNet. Confirm before removing.
static Class1()
{
}
193
/// <summary>
/// The main entry point for the application. Opens the tagged sample document, runs
/// the three logical-structure samples against it, and saves the document to the
/// output folder.
/// </summary>
/// <remarks>
/// Each <c>ElementReader</c> is disposed as soon as its sample is done with it, and
/// <c>PDFNet.Terminate()</c> runs in a <c>finally</c> block so that it is reached
/// even when an exception other than <c>PDFNetException</c> escapes.
/// </remarks>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    PDFNet.Initialize(PDFTronLicense.Key);
    // Relative path to the folder containing test files.
    string input_path = "../../../../TestFiles/";
    string output_path = "../../../../TestFiles/Output/";

    try // Extract logical structure from a PDF document
    {
        using (PDFDoc doc = new PDFDoc(input_path + "tagged.pdf"))
        {
            doc.InitSecurityHandler();

            // Toggle individual samples here.
            bool example1 = true;
            bool example2 = true;
            bool example3 = true;

            if (example1)
            {
                Console.WriteLine("____________________________________________________________");
                Console.WriteLine("Sample 1 - Traverse logical structure tree...");

                STree tree = doc.GetStructTree();
                if (tree.IsValid())
                {
                    Console.WriteLine("Document has a StructTree root.");
                    int num_kids = tree.GetNumKids();
                    for (int i = 0; i < num_kids; ++i)
                    {
                        // Recursively get structure info for all child elements.
                        ProcessStructElement(tree.GetKid(i), 0);
                    }
                }
                else
                {
                    Console.WriteLine("This document does not contain any logical structure.");
                }

                Console.WriteLine();
                Console.WriteLine("Done 1.");
            }

            if (example2)
            {
                Console.WriteLine("____________________________________________________________");
                Console.WriteLine("Sample 2 - Get parent logical structure elements from");
                Console.WriteLine("layout elements.");

                using (ElementReader reader = new ElementReader())
                {
                    for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next())
                    {
                        reader.Begin(itr.Current());
                        ProcessElements(reader);
                        reader.End();
                    }
                }
                Console.WriteLine();
                Console.WriteLine("Done 2.");
            }

            if (example3)
            {
                Console.WriteLine("____________________________________________________________");
                Console.WriteLine("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");

                // A map from page numbers (ints) to per-page maps, each of which maps
                // a struct MCID (int) to the text (String) collected for it.
                Hashtable mcid_doc_map = new Hashtable();
                using (ElementReader reader = new ElementReader())
                {
                    for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next())
                    {
                        Page pg = itr.Current();
                        reader.Begin(pg);
                        Hashtable page_mcid_map = new Hashtable();
                        mcid_doc_map.Add(pg.GetIndex(), page_mcid_map);
                        ProcessElements2(reader, page_mcid_map);
                        reader.End();
                    }
                }

                STree tree = doc.GetStructTree();
                if (tree.IsValid())
                {
                    int num_kids = tree.GetNumKids();
                    for (int i = 0; i < num_kids; ++i)
                    {
                        ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0);
                    }
                }
                Console.WriteLine();
                Console.WriteLine("Done 3.");
            }

            doc.Save(output_path + "LogicalStructure.pdf", 0);
        }
    }
    catch (PDFNetException e)
    {
        Console.WriteLine(e.Message);
    }
    finally
    {
        // Runs on every exit path from the try block, not only the handled one.
        PDFNet.Terminate();
    }
}
296 }
297}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales
PDF Logical Structure Reader with Server SDK in C# | Apryse documentation