Sample JavaScript code for using WebViewer to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s which are leaf nodes of the structure tree. Learn more about our full PDF Data Extraction SDK Capabilities.
To start your free trial, get started with WebViewer.
This sample works with Full-API for WebViewer.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6//---------------------------------------------------------------------------------------
7// This sample explores the structure and content of a tagged PDF document and dumps
8// the structure information to the console window.
9//
10// In tagged PDF documents StructTree acts as a central repository for information
11// related to a PDF document's logical structure. The tree consists of StructElement-s
12// and ContentItem-s which are leaf nodes of the structure tree.
13//
14// The sample can be extended to access and extract the marked-content elements such
15// as text and images.
16//---------------------------------------------------------------------------------------
17
18(exports => {
19
20
21
22
23
24
25
26
27
28
29 exports.runLogicalStructureTest = () => {
30 const PDFNet = exports.Core.PDFNet;
31
/**
 * Flushes the buffered line to the console and re-arms the print state.
 *
 * The buffered text is written using the indentation level stored by the
 * previous call (two spaces per level). Nothing is printed when the buffer is
 * empty. Afterwards the buffer is cleared and `indent` is stored for the
 * next flush.
 *
 * @param {{str: string, indent: (number|undefined)}} printState - Mutable print buffer.
 * @param {number} indent - Indentation level to apply to the next flushed line.
 */
const PrintAndIndent = (printState, indent) => {
  const { str: pendingText, indent: pendingLevel } = printState;
  if (pendingText) {
    const padding = ' '.repeat(pendingLevel * 2);
    console.log(`${padding}${pendingText}`);
  }
  Object.assign(printState, { str: '', indent });
};
40
41 // Read the structure recursively
/**
 * Recursively copies a structure element and its descendants into plain
 * objects, appending the result to `parent.children`.
 *
 * Invalid elements are skipped entirely (nothing is appended for them), so a
 * recorded node's `children.length` can be smaller than its `numKids`.
 *
 * Each structure node is recorded as
 * `{ type, numKids, isLeaf: false, children, title? }`; each content item as
 * `{ isLeaf: true, type, pageNum, mcid? | objNum? }`.
 *
 * @param {object} element - Structure element wrapper exposing the async
 *   accessors used below (isValid, getType, getNumKids, ...).
 * @param {{children: Array<object>}} parent - Node receiving the new entry.
 * @returns {Promise<void>}
 */
const ReadDocumentStructure = async (element, parent) => {
  if (!(await element.isValid())) {
    return;
  }

  const [type, numKids] = await Promise.all([element.getType(), element.getNumKids()]);

  const elementData = {
    type,
    numKids,
    isLeaf: false,
    children: [],
  };

  if (await element.hasTitle()) {
    elementData.title = await element.getTitle();
  }

  parent.children.push(elementData);

  for (let i = 0; i < numKids; ++i) {
    // Check if the kid is a leaf node (i.e. it is a ContentItem).
    if (!(await element.isContentItem(i))) {
      // The kid is another StructElement node.
      await ReadDocumentStructure(await element.getAsStructElem(i), elementData);
      continue;
    }

    const cont = await element.getAsContentItem(i);
    const [itemType, page] = await Promise.all([cont.getType(), cont.getPage()]);
    const contentItem = {
      isLeaf: true,
      type: itemType,
      pageNum: await page.getIndex(),
    };

    switch (itemType) {
      case PDFNet.ContentItem.Type.e_MCID:
      case PDFNet.ContentItem.Type.e_MCR:
        contentItem.mcid = await cont.getMCID();
        break;
      case PDFNet.ContentItem.Type.e_OBJR: {
        const refObj = await cont.getRefObj();
        if (refObj) {
          // getObjNum() is asynchronous; without the await a Promise would be
          // stored and later printed instead of the object number.
          contentItem.objNum = await refObj.getObjNum();
        }
        break;
      }
      default:
        break;
    }
    elementData.children.push(contentItem);
  }
};
98
99 // Read the elements sequentially with a reader
/**
 * Walks every page of `doc` with an ElementReader and records the path, text
 * and form XObject elements together with their parent structure element
 * info. Elements of any other type are skipped.
 *
 * Each record has `type` and `pageNum`; text elements also get `text`. When
 * the parent structure element is valid the record additionally carries
 * `structType`, `mcid`, `objNum` and (if present) `title`.
 *
 * @param {object} doc - Document wrapper exposing `getPageIterator()`.
 * @returns {Promise<Array<object>>} Records in reading order of the pages.
 */
const ReadElements = async doc => {
  const { e_path: pathType, e_text: textType, e_form: formType } = PDFNet.Element.Type;
  const elements = [];
  const reader = await PDFNet.ElementReader.create();

  for (let itr = await doc.getPageIterator(); await itr.hasNext(); await itr.next()) {
    const page = await itr.current();
    await reader.beginOnPage(page);
    try {
      const pageNum = await page.getIndex();
      let element;
      while ((element = await reader.next())) {
        // Read page contents
        const type = await element.getType();
        // Only paths, text and form XObjects are reported by the samples.
        if (type !== pathType && type !== textType && type !== formType) {
          continue;
        }

        const readElement = { type, pageNum };
        if (type === textType) {
          // Only text elements carry a text string.
          readElement.text = await element.getTextString();
        }

        // Check if the element is associated with any structural element.
        // Content items are leaf nodes of the structure tree.
        const structParent = await element.getParentStructElement();
        readElement.isValid = await structParent.isValid();
        if (readElement.isValid) {
          readElement.structType = await structParent.getType();
          readElement.mcid = await element.getStructMCID();
          if (await structParent.hasTitle()) {
            readElement.title = await structParent.getTitle();
          }
          readElement.objNum = await (await structParent.getSDFObj()).getObjNum();
        }
        elements.push(readElement);
      }
    } finally {
      // Always close the page that was opened with beginOnPage.
      await reader.end();
    }
  }
  return elements;
};
135
136 // Used in code snippet 1.
/**
 * Appends an indented, human-readable dump of a recorded structure node and
 * its descendants to the print state (flushing lines via PrintAndIndent).
 *
 * The node's own line is printed at `indent`; its kids at `indent + 1`.
 * Iteration is over the recorded `children` rather than `numKids`, because
 * invalid kids are never recorded and the two can therefore disagree.
 *
 * @param {object} element - Recorded structure node (`type`, optional
 *   `title`, `children`).
 * @param {number} indent - Indentation level for this node.
 * @param {{str: string, indent: number}} printState - Mutable print buffer.
 */
const ProcessStructElement = (element, indent, printState) => {
  // Print out the type and title info, if any.
  PrintAndIndent(printState, indent);
  printState.str += `Type: ${element.type}${element.title ? `. Title: ${element.title}` : ''}`;

  const childIndent = indent + 1;
  for (const child of element.children || []) {
    // Check if the kid is a leaf node (i.e. it is a ContentItem).
    if (!child.isLeaf) {
      // The kid is another StructElement node.
      ProcessStructElement(child, childIndent, printState);
      continue;
    }

    PrintAndIndent(printState, childIndent);
    printState.str += `Content Item. Part of page #${child.pageNum}`;

    PrintAndIndent(printState, childIndent);
    switch (child.type) {
      case PDFNet.ContentItem.Type.e_MCID:
      case PDFNet.ContentItem.Type.e_MCR:
        printState.str += `MCID: ${child.mcid}`;
        break;
      case PDFNet.ContentItem.Type.e_OBJR:
        printState.str += 'OBJR ';
        if (child.objNum) {
          printState.str += `- Referenced Object#: ${child.objNum}`;
        }
        break;
      default:
        break;
    }
  }
};
170
171 // Used in code snippet 2.
/**
 * Appends a textual description of each recorded page element to
 * `printState.str`, followed by its parent structure element's type, MCID,
 * optional title and object number when the parent is valid.
 *
 * In this sample only paths, text and form XObjects are processed, but the
 * code can be extended to handle any element type.
 *
 * @param {Array<object>} elementsArray - Element records (`type`, `text`,
 *   `isValid`, `structType`, `mcid`, `title`, `objNum`).
 * @param {{str: string}} printState - Mutable print buffer that is appended to.
 */
const ProcessElementsArray = (elementsArray, printState) => {
  const { e_path: pathType, e_text: textType, e_form: formType } = PDFNet.Element.Type;

  for (const element of elementsArray) {
    switch (element.type) {
      case pathType: // Process path ...
        printState.str += '\nPATH: ';
        break;
      case textType: // Process text ...
        printState.str += `\nTEXT: ${element.text}\n`;
        break;
      case formType: // Process form XObjects
        printState.str += '\nFORM XObject: ';
        break;
      default:
        // Any other element type is ignored, including its structure info.
        continue;
    }

    if (element.isValid) {
      // Print out the parent structural element's type, title, and object number.
      printState.str += ` Type: ${element.structType}, MCID: ${element.mcid}`;
      if (element.title) {
        printState.str += `. Title: ${element.title}`;
      }
      printState.str += `, Obj#: ${element.objNum}`;
    }
  }
};
205
206 // Used in code snippet 3.
/**
 * Builds a two-level lookup `pageNum -> mcid -> text` from the element
 * records.
 *
 * Every page that appears in `elementsArray` gets an entry (possibly empty).
 * Only text elements with a non-negative `mcid` contribute text; the text of
 * records sharing a page and MCID is concatenated in array order.
 *
 * @param {Array<object>} elementsArray - Element records (`pageNum`, `mcid`,
 *   `type`, `text`).
 * @returns {Object<string, Object<string, string>>} The per-page MCID text map.
 */
const CreateMCIDDocMap = elementsArray =>
  elementsArray.reduce((docMap, { pageNum, mcid, type, text }) => {
    // Make sure the page has a bucket even if this record adds no text.
    const pageMap = docMap[pageNum] || (docMap[pageNum] = {});
    const isTaggedText = mcid >= 0 && type === PDFNet.Element.Type.e_text;
    if (isTaggedText) {
      pageMap[mcid] = mcid in pageMap ? pageMap[mcid] + text : text;
    }
    return docMap;
  }, {});
225
226 // Used in code snippet 3.
/**
 * Appends an 'XML style' dump of a recorded structure node to the print
 * state: an opening tag (with optional title attribute), the text of its
 * MCID content items looked up in `mcidDocMap`, nested child elements one
 * level deeper, and a closing tag.
 *
 * Iteration is over the recorded `children` rather than `numKids`, because
 * invalid kids are never recorded and the two can therefore disagree.
 *
 * @param {object} element - Recorded structure node (`type`, optional
 *   `title`, `children`).
 * @param {Object<string, Object<string, string>>} mcidDocMap - Lookup
 *   `pageNum -> mcid -> text`.
 * @param {number} indent - Indentation level for this node's tags.
 * @param {{str: string, indent: number}} printState - Mutable print buffer.
 */
const ProcessStructElement2 = (element, mcidDocMap, indent, printState) => {
  // Print out the type and title info, if any.
  PrintAndIndent(printState, indent);
  printState.str += `<${element.type}${element.title ? ` title="${element.title}"` : ''}>`;

  for (const child of element.children || []) {
    if (!child.isLeaf) {
      // The kid is another StructElement node.
      ProcessStructElement2(child, mcidDocMap, indent + 1, printState);
      continue;
    }
    if (child.type !== PDFNet.ContentItem.Type.e_MCID) {
      continue;
    }
    const mcidPageMap = mcidDocMap[child.pageNum];
    if (mcidPageMap && child.mcid in mcidPageMap) {
      printState.str += mcidPageMap[child.mcid];
    }
  }

  PrintAndIndent(printState, indent);
  printState.str += `</${element.type}>`;
};
254
/**
 * Sample entry point: loads `tagged.pdf`, reads its structure tree and page
 * elements, prints the three sample dumps to the console and finally hands a
 * saved copy of the document to `saveBufferAsPDFDoc`.
 *
 * Any error raised along the way is caught and logged to the console.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  // Relative path to the folder containing test files.
  const inputPath = '../TestFiles/';
  const printState = { str: '', indent: 0 };
  try {
    // Extract logical structure from a PDF document
    const doc = await PDFNet.PDFDoc.createFromURL(`${inputPath}tagged.pdf`);
    await doc.initSecurityHandler();

    const tree = await doc.getStructTree();
    const hasValidTree = await tree.isValid();
    const structRoot = {
      children: [],
    };
    let elementsArray = [];

    if (hasValidTree) {
      console.log('Document has a StructTree root.');

      // Recursively get structure info for all child elements of the root.
      // A plain async function (instead of `new Promise(async ...)`) lets a
      // failure reject Promise.all and reach the catch block below.
      const readStructure = async () => {
        const numKids = await tree.getNumKids();
        for (let i = 0; i < numKids; ++i) {
          await ReadDocumentStructure(await tree.getKid(i), structRoot);
        }
      };

      const [, readElements] = await Promise.all([readStructure(), ReadElements(doc)]);
      elementsArray = readElements;
    } else {
      console.log('This document does not contain any logical structure.');
    }

    console.log('____________________________________________________________');
    console.log('Sample 1 - Traverse logical structure tree...');
    for (const rootKid of structRoot.children) {
      ProcessStructElement(rootKid, 0, printState);
    }
    PrintAndIndent(printState, 0);
    console.log('Done 1.');

    console.log('____________________________________________________________');
    console.log('Sample 2 - Get parent logical structure elements from');
    console.log('layout elements.');
    ProcessElementsArray(elementsArray, printState);
    PrintAndIndent(printState, 0);
    console.log('Done 2.');

    console.log('____________________________________________________________');
    console.log("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
    {
      const mcidDocMap = CreateMCIDDocMap(elementsArray);
      // Iterate the kids that were actually recorded: invalid root kids are
      // skipped while reading, so the tree's kid count may be larger.
      for (const rootKid of structRoot.children) {
        ProcessStructElement2(rootKid, mcidDocMap, 0, printState);
      }
    }
    PrintAndIndent(printState, 0);
    console.log('Done 3.');
    const docBuffer = await doc.saveMemoryBuffer(0);
    // NOTE(review): saveBufferAsPDFDoc is not defined in this file and the
    // output name 'bookmark.pdf' looks copied from another sample - confirm.
    saveBufferAsPDFDoc(docBuffer, 'bookmark.pdf');
  } catch (err) {
    console.log(err);
  }
};
323
324 // add your own license key as the second parameter, e.g. PDFNet.runWithCleanup(main, 'YOUR_LICENSE_KEY')
325 PDFNet.runWithCleanup(main);
326 };
327})(window);
328// eslint-disable-next-line spaced-comment
329//# sourceURL=LogicalStructureTest.js
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales