LogicalStructure

Sample JavaScript code that uses the Apryse SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; the ContentItems are the leaf nodes of the structure tree. Learn more about our Web SDK and PDF Data Extraction SDK capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6//---------------------------------------------------------------------------------------
7// This sample explores the structure and content of a tagged PDF document and dumps
8// the structure information to the console window.
9//
10// In tagged PDF documents StructTree acts as a central repository for information
11// related to a PDF document's logical structure. The tree consists of StructElement-s
12// and ContentItem-s which are leaf nodes of the structure tree.
13//
14// The sample can be extended to access and extract the marked-content elements such
15// as text and images.
16//---------------------------------------------------------------------------------------
17
18(exports => {
19
20
21
22
23
24
25
26
27
28
29 exports.runLogicalStructureTest = () => {
30 const PDFNet = exports.Core.PDFNet;
31
// Flushes the text buffered in `printState.str` to the console, indented by
// two spaces per level of `printState.indent` (the level recorded by the
// previous call), then clears the buffer and records `indent` as the level
// for whatever text is buffered next.
//
// printState: mutable object with a `str` buffer and an optional `indent` level.
// indent:     indentation level to apply to the NEXT flushed line.
const PrintAndIndent = (printState, indent) => {
  if (printState.str) {
    // `indent` is not set before the first call and a negative value would
    // make String.prototype.repeat throw, so normalise to a non-negative level.
    const level = Number.isFinite(printState.indent) ? Math.max(0, printState.indent) : 0;
    console.log(' '.repeat(level * 2) + printState.str);
  }
  printState.str = '';
  printState.indent = indent;
};
40
// Read the structure recursively.
//
// Converts the structure element `element` and everything below it into plain
// objects and appends the result to `parent.children`. An element that reports
// itself as invalid is skipped entirely, so `parent.children` may end up with
// fewer entries than the parent's reported kid count.
//
// element: structure-element handle exposing the async accessors used below.
// parent:  object with a `children` array that receives the converted node.
const ReadDocumentStructure = async (element, parent) => {
  if (!(await element.isValid())) {
    return;
  }

  const [type, numKids] = await Promise.all([element.getType(), element.getNumKids()]);

  const elementData = {
    type,
    numKids,
    isLeaf: false,
    children: [],
  };

  if (await element.hasTitle()) {
    elementData.title = await element.getTitle();
  }

  parent.children.push(elementData);

  for (let i = 0; i < numKids; ++i) {
    // Check if the kid is a leaf node (i.e. it is a ContentItem).
    const isLeaf = await element.isContentItem(i);
    if (!isLeaf) {
      // The kid is another StructElement node.
      await ReadDocumentStructure(await element.getAsStructElem(i), elementData);
      continue;
    }

    const cont = await element.getAsContentItem(i);
    const [itemType, page] = await Promise.all([cont.getType(), cont.getPage()]);

    const contentItem = {
      isLeaf,
      type: itemType,
      pageNum: await page.getIndex(),
    };

    switch (itemType) {
      case PDFNet.ContentItem.Type.e_MCID:
      case PDFNet.ContentItem.Type.e_MCR:
        contentItem.mcid = await cont.getMCID();
        break;
      case PDFNet.ContentItem.Type.e_OBJR: {
        const refObj = await cont.getRefObj();
        if (refObj) {
          // getObjNum() is asynchronous like the other accessors here; without
          // the await a Promise would be stored instead of the number.
          contentItem.objNum = await refObj.getObjNum();
        }
        break;
      }
      default:
        break;
    }

    elementData.children.push(contentItem);
  }
};
98
// Read the elements sequentially with a reader.
//
// Walks every page of `doc` with an ElementReader and returns a flat array of
// plain records for the path and text elements it encounters. Each record has
// `type`, `pageNum`, `text` and `isValid` (whether the element has a valid
// parent structure element); when valid it also carries `structType`, `mcid`,
// `objNum` and, if the parent has one, `title`.
//
// doc: document handle exposing `getPageIterator()`.
const ReadElements = async doc => {
  const { e_path, e_text } = PDFNet.Element.Type;
  const elements = [];
  const reader = await PDFNet.ElementReader.create();

  // Every reader/iterator call is awaited so a failure surfaces in this
  // function instead of as an unhandled rejection.
  for (let itr = await doc.getPageIterator(); await itr.hasNext(); await itr.next()) {
    const page = await itr.current();
    await reader.beginOnPage(page);
    const pageNum = await page.getIndex();

    let element;
    while ((element = await reader.next())) {
      // Read page contents; only paths and text are collected.
      const type = await element.getType();
      if (type !== e_path && type !== e_text) {
        continue;
      }

      const readElement = { type, pageNum };
      readElement.text = await element.getTextString();

      // Check if the element is associated with any structural element.
      // Content items are leaf nodes of the structure tree.
      const structParent = await element.getParentStructElement();
      readElement.isValid = await structParent.isValid();
      if (readElement.isValid) {
        readElement.structType = await structParent.getType();
        readElement.mcid = await element.getStructMCID();
        if (await structParent.hasTitle()) {
          readElement.title = await structParent.getTitle();
        }
        const parentObj = await structParent.getSDFObj();
        readElement.objNum = await parentObj.getObjNum();
      }
      elements.push(readElement);
    }
    await reader.end();
  }
  return elements;
};
135
// Used in code snippet 1.
//
// Prints one structure node (as produced by ReadDocumentStructure) and,
// recursively, everything below it. Output goes through PrintAndIndent, so the
// last line produced stays buffered in `printState.str` until the next flush.
//
// element:    plain node with `type`, optional `title` and a `children` array.
// indent:     indentation level for this node; its kids are printed one deeper.
// printState: shared print buffer consumed by PrintAndIndent.
const ProcessStructElement = (element, indent, printState) => {
  // Print out the type and title info, if any.
  PrintAndIndent(printState, indent);
  printState.str += `Type: ${element.type}${element.title ? `. Title: ${element.title}` : ''}`;

  const childIndent = indent + 1;

  // Iterate the kids that were actually collected rather than counting up to
  // `numKids`: invalid kids are skipped while reading, so `children` can be
  // shorter than `numKids`.
  for (const child of element.children || []) {
    if (!child.isLeaf) {
      // The kid is another StructElement node.
      ProcessStructElement(child, childIndent, printState);
      continue;
    }

    // The kid is a leaf node (i.e. it is a ContentItem).
    PrintAndIndent(printState, childIndent);
    printState.str += `Content Item. Part of page #${child.pageNum}`;

    PrintAndIndent(printState, childIndent);
    switch (child.type) {
      case PDFNet.ContentItem.Type.e_MCID:
      case PDFNet.ContentItem.Type.e_MCR:
        printState.str += `MCID: ${child.mcid}`;
        break;
      case PDFNet.ContentItem.Type.e_OBJR:
        printState.str += 'OBJR ';
        if (child.objNum) {
          printState.str += `- Referenced Object#: ${child.objNum}`;
        }
        break;
      default:
        break;
    }
  }
};
170
// Used in code snippet 2.
//
// Appends a textual description of every path and text record in
// `elementsArray` to `printState.str`. Records of any other type are ignored.
// For records flagged `isValid`, the parent structural element's type, MCID,
// optional title and object number are appended as well.
//
// elementsArray: records shaped like those returned by ReadElements.
// printState:    shared print buffer; only its `str` field is modified.
const ProcessElementsArray = (elementsArray, printState) => {
  const { e_path, e_text } = PDFNet.Element.Type;

  for (const element of elementsArray) {
    // In this sample we process only paths & text, but the code can be
    // extended to handle any element type by adding a branch here.
    if (element.type === e_path) {
      printState.str += '\nPATH: ';
    } else if (element.type === e_text) {
      printState.str += `\nTEXT: ${element.text}\n`;
    } else {
      continue;
    }

    if (element.isValid) {
      // Print out the parent structural element's type, title, and object number.
      printState.str += ` Type: ${element.structType}, MCID: ${element.mcid}`;
      if (element.title) {
        printState.str += `. Title: ${element.title}`;
      }
      printState.str += `, Obj#: ${element.objNum}`;
    }
  }
};
205
// Used in code snippet 3.
//
// Builds a two-level lookup `{ [pageNum]: { [mcid]: text } }` from the records
// in `elementsArray`. Every record creates an entry for its page, but only
// text records with a non-negative MCID contribute text; text fragments that
// share a page and MCID are concatenated in array order.
//
// elementsArray: records shaped like those returned by ReadElements.
// Returns the page -> (MCID -> text) map.
const CreateMCIDDocMap = elementsArray => {
  const mcidDocMap = {};

  for (const element of elementsArray) {
    const { pageNum, mcid, type, text } = element;

    if (!mcidDocMap[pageNum]) {
      mcidDocMap[pageNum] = {};
    }
    const pageMcidMap = mcidDocMap[pageNum];

    // `null >= 0` is true in JavaScript, so rule out a missing MCID explicitly
    // before the range check.
    const hasMcid = mcid != null && mcid >= 0;
    if (!hasMcid || type !== PDFNet.Element.Type.e_text) {
      continue;
    }

    if (mcid in pageMcidMap) {
      pageMcidMap[mcid] += text;
    } else {
      pageMcidMap[mcid] = text;
    }
  }

  return mcidDocMap;
};
225
// Used in code snippet 3.
//
// Prints one structure node and its descendants in an XML-like form: an
// opening tag with the node's type (and title, if any), the page text mapped
// to each of its MCID content items, its nested structure elements one level
// deeper, and finally a closing tag. Output goes through PrintAndIndent, so
// the closing tag stays buffered in `printState.str` until the next flush.
//
// element:    plain node with `type`, optional `title` and a `children` array.
// mcidDocMap: page -> (MCID -> text) lookup, as built by CreateMCIDDocMap.
// indent:     indentation level for this node's tags.
// printState: shared print buffer consumed by PrintAndIndent.
const ProcessStructElement2 = (element, mcidDocMap, indent, printState) => {
  // Print out the type and title info, if any.
  PrintAndIndent(printState, indent);
  printState.str += `<${element.type}${element.title ? ` title="${element.title}"` : ''}>`;

  // Iterate the kids that were actually collected rather than counting up to
  // `numKids`: invalid kids are skipped while reading, so `children` can be
  // shorter than `numKids`.
  for (const child of element.children || []) {
    if (!child.isLeaf) {
      // The kid is another StructElement node.
      ProcessStructElement2(child, mcidDocMap, indent + 1, printState);
      continue;
    }

    if (child.type !== PDFNet.ContentItem.Type.e_MCID) {
      continue;
    }

    const mcidPageMap = mcidDocMap[child.pageNum];
    if (mcidPageMap && child.mcid in mcidPageMap) {
      printState.str += mcidPageMap[child.mcid];
    }
  }

  PrintAndIndent(printState, indent);
  printState.str += `</${element.type}>`;
};
254
// Entry point of the sample: loads the tagged test PDF, reads its structure
// tree and page elements into plain objects, prints the three sample views to
// the console, and finally saves the document to a memory buffer. Any error is
// caught and logged.
const main = async () => {
  // Relative path to the folder containing test files.
  const inputPath = '../TestFiles/';
  const printState = { str: '' };
  try {
    // Extract logical structure from a PDF document
    const doc = await PDFNet.PDFDoc.createFromURL(`${inputPath}tagged.pdf`);
    await doc.initSecurityHandler();

    const tree = await doc.getStructTree();
    const hasValidTree = await tree.isValid();
    const structRoot = {
      children: [],
    };
    let elementsArray = [];

    if (hasValidTree) {
      console.log('Document has a StructTree root.');

      // A plain async function (instead of `new Promise(async ...)`) so that a
      // failure while reading the tree rejects Promise.all rather than leaving
      // it pending forever.
      const readStructure = async () => {
        const numKidsFromRoot = await tree.getNumKids();
        for (let i = 0; i < numKidsFromRoot; ++i) {
          // Recursively get structure info for all child elements.
          await ReadDocumentStructure(await tree.getKid(i), structRoot);
        }
      };

      const results = await Promise.all([readStructure(), ReadElements(doc)]);
      elementsArray = results[1];
    } else {
      console.log('This document does not contain any logical structure.');
    }

    console.log('____________________________________________________________');
    console.log('Sample 1 - Traverse logical structure tree...');
    for (const rootChild of structRoot.children) {
      // Recursively print structure info for all child elements.
      ProcessStructElement(rootChild, 0, printState);
    }
    PrintAndIndent(printState, 0);
    console.log('Done 1.');

    console.log('____________________________________________________________');
    console.log('Sample 2 - Get parent logical structure elements from');
    console.log('layout elements.');
    ProcessElementsArray(elementsArray, printState);
    PrintAndIndent(printState, 0);
    console.log('Done 2.');

    console.log('____________________________________________________________');
    console.log("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
    {
      const mcidDocMap = CreateMCIDDocMap(elementsArray);
      // Walk the collected root nodes (not the raw root kid count): invalid
      // kids are never collected, and the list is empty when there is no tree.
      for (const rootChild of structRoot.children) {
        ProcessStructElement2(rootChild, mcidDocMap, 0, printState);
      }
    }
    PrintAndIndent(printState, 0);
    console.log('Done 3.');

    const docBuffer = await doc.saveMemoryBuffer(0);
    // NOTE(review): `saveBufferAsPDFDoc` is not defined in the visible part of
    // this file - presumably a helper provided by the sample harness; confirm.
    // The 'bookmark.pdf' output name also looks copied from another sample.
    saveBufferAsPDFDoc(docBuffer, 'bookmark.pdf');
  } catch (err) {
    console.log(err);
  }
};
323
324 // add your own license key as the second parameter, e.g. PDFNet.runWithCleanup(main, 'YOUR_LICENSE_KEY')
325 PDFNet.runWithCleanup(main);
326 };
327})(window);
328// eslint-disable-next-line spaced-comment
329//# sourceURL=LogicalStructureTest.js

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales