Sample JavaScript code that uses WebViewer to explore the logical structure and content of a tagged PDF file and dump that information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; the ContentItems are the leaf nodes of the structure tree.
Learn more about our full PDF Data Extraction SDK Capabilities.
Step 1: Follow the get-started guide for your preferred web stack for WebViewer
Step 2: Enable the full API by passing the fullAPI option into the WebViewer constructor
Step 3: Add the sample code provided in this guide
This full sample is one of many included in the manual download of WebViewer.
//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------

//---------------------------------------------------------------------------------------
// This sample explores the structure and content of a tagged PDF document and dumps
// the structure information to the console window.
//
// In tagged PDF documents StructTree acts as a central repository for information
// related to a PDF document's logical structure. The tree consists of StructElement-s
// and ContentItem-s which are leaf nodes of the structure tree.
//
// The sample can be extended to access and extract the marked-content elements such
// as text and images.
//---------------------------------------------------------------------------------------
17
18(exports => {
19 exports.runLogicalStructureTest = () => {
20 const PDFNet = exports.Core.PDFNet;
21
/**
 * Flushes the pending output line to the console, then starts a new empty line.
 *
 * The text accumulated in `printState.str` is logged using the indent level that
 * was stored by the previous call (two spaces per level); nothing is logged when
 * the pending text is empty. Afterwards the buffer is cleared and `indent` is
 * remembered for the next flush.
 *
 * @param {{str: string, indent?: number}} printState - Shared, mutable print buffer.
 * @param {number} indent - Indent level to apply to the line that is started now.
 */
const PrintAndIndent = (printState, indent) => {
  if (printState.str) {
    // No indent level has been stored before the first call; treat that as level 0.
    const pendingLevel = printState.indent || 0;
    console.log(' '.repeat(pendingLevel * 2) + printState.str);
  }
  printState.str = '';
  printState.indent = indent;
};
30
/**
 * Recursively copies a structure element and its descendants into plain objects.
 *
 * A record `{ type, numKids, isLeaf: false, children, title? }` is appended to
 * `parent.children`. Each kid becomes either a leaf record (content item) with
 * `{ isLeaf: true, type, pageNum, mcid? | objNum? }` or a nested element record.
 *
 * Invalid elements are skipped entirely (nothing is appended), so a record's
 * `children.length` can be smaller than its `numKids`.
 *
 * @param {object} element - PDFNet structure element handle to read.
 * @param {{children: Array<object>}} parent - Record that receives the new child record.
 * @returns {Promise<void>} Resolves once the whole subtree has been read.
 */
const ReadDocumentStructure = async (element, parent) => {
  if (!(await element.isValid())) {
    return;
  }

  const [type, numKids] = await Promise.all([element.getType(), element.getNumKids()]);

  const elementData = {
    type,
    numKids,
    isLeaf: false,
    children: [],
  };

  if (await element.hasTitle()) {
    elementData.title = await element.getTitle();
  }

  parent.children.push(elementData);

  for (let i = 0; i < numKids; ++i) {
    // Check if the kid is a leaf node (i.e. it is a ContentItem).
    const isLeaf = await element.isContentItem(i);
    if (!isLeaf) {
      // The kid is another StructElement node.
      await ReadDocumentStructure(await element.getAsStructElem(i), elementData);
      continue;
    }

    const cont = await element.getAsContentItem(i);
    const [itemType, page] = await Promise.all([cont.getType(), cont.getPage()]);
    const contentItem = {
      isLeaf,
      type: itemType,
      pageNum: await page.getIndex(),
    };

    switch (itemType) {
      case PDFNet.ContentItem.Type.e_MCID:
      case PDFNet.ContentItem.Type.e_MCR:
        contentItem.mcid = await cont.getMCID();
        break;
      case PDFNet.ContentItem.Type.e_OBJR: {
        const refObj = await cont.getRefObj();
        if (refObj) {
          // getObjNum() is asynchronous like the other PDFNet calls; without the
          // await the record would hold a Promise instead of the object number.
          contentItem.objNum = await refObj.getObjNum();
        }
        break;
      }
      default:
        break;
    }
    elementData.children.push(contentItem);
  }
};
88
/**
 * Reads the path and text elements of every page sequentially with an ElementReader.
 *
 * Each returned record has `{ type, pageNum, text, isValid }`. When the element's
 * parent structure element is valid, the record also carries `structType`, `mcid`,
 * `objNum` and, if the parent has one, `title`. Elements of any other type are
 * skipped.
 *
 * @param {object} doc - PDFNet document handle providing `getPageIterator()`.
 * @returns {Promise<Array<object>>} Element records in reading order.
 */
const ReadElements = async doc => {
  const { e_path: pathType, e_text: textType } = PDFNet.Element.Type;
  const elements = [];
  const reader = await PDFNet.ElementReader.create();

  // Every PDFNet call is awaited so a failure surfaces here instead of as an
  // unhandled rejection.
  for (let itr = await doc.getPageIterator(); await itr.hasNext(); await itr.next()) {
    const page = await itr.current();
    await reader.beginOnPage(page);
    try {
      const pageNum = await page.getIndex();
      let element;
      while ((element = await reader.next())) {
        const type = await element.getType();
        // In this sample only paths and text are collected.
        if (type !== pathType && type !== textType) {
          continue;
        }

        const readElement = {
          type,
          pageNum,
          text: await element.getTextString(),
        };

        // Check if the element is associated with any structural element.
        // Content items are leaf nodes of the structure tree.
        const structParent = await element.getParentStructElement();
        readElement.isValid = await structParent.isValid();
        if (readElement.isValid) {
          readElement.structType = await structParent.getType();
          readElement.mcid = await element.getStructMCID();
          if (await structParent.hasTitle()) {
            readElement.title = await structParent.getTitle();
          }
          const parentObj = await structParent.getSDFObj();
          readElement.objNum = await parentObj.getObjNum();
        }
        elements.push(readElement);
      }
    } finally {
      // Close the page on the reader even when reading it failed.
      await reader.end();
    }
  }
  return elements;
};
125
/**
 * Used in code snippet 1: prints a structure element record and its subtree.
 *
 * The element's type (and title, if any) goes on one line; every content item
 * below it produces a "Content Item" line followed by a detail line, and nested
 * element records are printed recursively one indent level deeper. Output is
 * buffered through `printState`, so the last line is only emitted by the caller's
 * next `PrintAndIndent` call.
 *
 * @param {object} element - Record with `type`, optional `title` and `children`.
 * @param {number} indent - Indent level for this element's own line.
 * @param {{str: string, indent?: number}} printState - Shared print buffer.
 */
const ProcessStructElement = (element, indent, printState) => {
  const { e_MCID, e_MCR, e_OBJR } = PDFNet.ContentItem.Type;
  const childIndent = indent + 1;

  // Print out the type and title info, if any.
  PrintAndIndent(printState, indent);
  printState.str += `Type: ${element.type}${element.title ? `. Title: ${element.title}` : ''}`;

  // Walk the children actually recorded rather than counting to numKids: kids
  // that were skipped while reading leave `children` shorter than `numKids`.
  for (const child of element.children) {
    if (!child.isLeaf) {
      // The kid is another StructElement node.
      ProcessStructElement(child, childIndent, printState);
      continue;
    }

    // The kid is a leaf node (i.e. it is a ContentItem).
    PrintAndIndent(printState, childIndent);
    printState.str += `Content Item. Part of page #${child.pageNum}`;

    PrintAndIndent(printState, childIndent);
    switch (child.type) {
      case e_MCID:
      case e_MCR:
        printState.str += `MCID: ${child.mcid}`;
        break;
      case e_OBJR:
        printState.str += 'OBJR ';
        if (child.objNum) {
          printState.str += `- Referenced Object#: ${child.objNum}`;
        }
        break;
      default:
        break;
    }
  }
};
160
/**
 * Used in code snippet 2: appends a description of each path/text element record
 * to `printState.str`.
 *
 * Paths add "\nPATH: ", text adds "\nTEXT: <text>\n". When the record has a valid
 * parent structure element, its type, MCID, optional title and object number
 * follow. Records of any other element type are ignored.
 *
 * @param {Array<object>} elementsArray - Element records to describe.
 * @param {{str: string}} printState - Shared print buffer that receives the text.
 */
const ProcessElementsArray = (elementsArray, printState) => {
  const { e_path: pathType, e_text: textType } = PDFNet.Element.Type;

  for (const element of elementsArray) {
    // In this sample we process only paths & text, but the code can be
    // extended to handle any element type (e.g. form XObjects).
    if (element.type === pathType) {
      printState.str += '\nPATH: ';
    } else if (element.type === textType) {
      printState.str += `\nTEXT: ${element.text}\n`;
    } else {
      continue;
    }

    if (element.isValid) {
      // Print out the parent structural element's type, title, and object number.
      printState.str += ` Type: ${element.structType}, MCID: ${element.mcid}`;
      if (element.title) {
        printState.str += `. Title: ${element.title}`;
      }
      printState.str += `, Obj#: ${element.objNum}`;
    }
  }
};
195
/**
 * Used in code snippet 3: groups the text of tagged text elements by page and MCID.
 *
 * The result maps page number -> (MCID -> concatenated text). Every page that
 * appears in `elementsArray` gets an entry, even when none of its elements
 * contribute text. Only text elements whose `mcid` is >= 0 are added; text
 * sharing an MCID on the same page is concatenated in array order.
 *
 * @param {Array<object>} elementsArray - Element records with `pageNum`, `type`, `mcid`, `text`.
 * @returns {Object<string, Object<string, string>>} Per-page MCID-to-text maps.
 */
const CreateMCIDDocMap = elementsArray => {
  const mcidDocMap = {};

  for (const element of elementsArray) {
    const { pageNum, mcid } = element;
    if (!mcidDocMap[pageNum]) {
      mcidDocMap[pageNum] = {};
    }
    const pageMcidMap = mcidDocMap[pageNum];

    // Records without a structure parent have no mcid; `undefined >= 0` is false.
    const isTaggedText = mcid >= 0 && element.type === PDFNet.Element.Type.e_text;
    if (!isTaggedText) {
      continue;
    }

    if (mcid in pageMcidMap) {
      pageMcidMap[mcid] += element.text;
    } else {
      pageMcidMap[mcid] = element.text;
    }
  }

  return mcidDocMap;
};
215
/**
 * Used in code snippet 3: prints a structure element record as an XML-style tag.
 *
 * Emits `<type title="...">`, then the text of every MCID content item found in
 * `mcidDocMap` for the item's page, then nested element records one indent level
 * deeper, and finally `</type>`. Content items of other types are ignored.
 * Output is buffered through `printState`, so the closing tag is only emitted by
 * the caller's next `PrintAndIndent` call.
 *
 * @param {object} element - Record with `type`, optional `title` and `children`.
 * @param {Object<string, Object<string, string>>} mcidDocMap - Page number -> (MCID -> text).
 * @param {number} indent - Indent level for this element's tags.
 * @param {{str: string, indent?: number}} printState - Shared print buffer.
 */
const ProcessStructElement2 = (element, mcidDocMap, indent, printState) => {
  // Print out the type and title info, if any.
  PrintAndIndent(printState, indent);
  printState.str += `<${element.type}${element.title ? ` title="${element.title}"` : ''}>`;

  // Walk the children actually recorded rather than counting to numKids: kids
  // that were skipped while reading leave `children` shorter than `numKids`.
  for (const child of element.children) {
    if (!child.isLeaf) {
      // The kid is another StructElement node.
      ProcessStructElement2(child, mcidDocMap, indent + 1, printState);
      continue;
    }
    if (child.type !== PDFNet.ContentItem.Type.e_MCID) {
      continue;
    }

    const mcidPageMap = mcidDocMap[child.pageNum];
    if (mcidPageMap && child.mcid in mcidPageMap) {
      printState.str += mcidPageMap[child.mcid];
    }
  }

  PrintAndIndent(printState, indent);
  printState.str += `</${element.type}>`;
};
244
/**
 * Sample driver: loads `tagged.pdf`, reads its structure tree and page elements,
 * and prints three views of the logical structure to the console.
 *
 * Any error raised along the way is caught and logged rather than rethrown.
 *
 * @returns {Promise<void>} Resolves when all three samples have been printed.
 */
const main = async () => {
  // Relative path to the folder containing test files.
  const inputPath = '../TestFiles/';
  const printState = { str: '' };
  try {
    // Extract logical structure from a PDF document
    const doc = await PDFNet.PDFDoc.createFromURL(`${inputPath}tagged.pdf`);
    await doc.initSecurityHandler();

    const tree = await doc.getStructTree();
    const hasValidTree = await tree.isValid();
    const structRoot = {
      children: [],
    };
    let elementsArray = [];

    if (hasValidTree) {
      console.log('Document has a StructTree root.');

      // Plain async function instead of `new Promise(async ...)`: a failure while
      // reading the tree now rejects and reaches the catch below instead of
      // leaving a promise that never settles.
      const readStructTree = async () => {
        // Only query the kids once the tree is known to be valid.
        const numKids = await tree.getNumKids();
        for (let i = 0; i < numKids; ++i) {
          // Recursively get structure info for all child elements.
          await ReadDocumentStructure(await tree.getKid(i), structRoot);
        }
      };

      const results = await Promise.all([readStructTree(), ReadElements(doc)]);
      elementsArray = results[1];
    } else {
      console.log('This document does not contain any logical structure.');
    }

    console.log('____________________________________________________________');
    console.log('Sample 1 - Traverse logical structure tree...');
    for (const rootChild of structRoot.children) {
      ProcessStructElement(rootChild, 0, printState);
    }
    PrintAndIndent(printState, 0);
    console.log('Done 1.');

    console.log('____________________________________________________________');
    console.log('Sample 2 - Get parent logical structure elements from');
    console.log('layout elements.');
    ProcessElementsArray(elementsArray, printState);
    PrintAndIndent(printState, 0);
    console.log('Done 2.');

    console.log('____________________________________________________________');
    console.log("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
    {
      const mcidDocMap = CreateMCIDDocMap(elementsArray);
      // Iterate the records actually read, not the root's kid count: invalid
      // kids are skipped during reading, so the two can differ.
      for (const rootChild of structRoot.children) {
        ProcessStructElement2(rootChild, mcidDocMap, 0, printState);
      }
    }
    PrintAndIndent(printState, 0);
    console.log('Done 3.');

    // NOTE(review): saveBufferAsPDFDoc is not defined in this file and the
    // 'bookmark.pdf' name looks carried over from another sample - confirm.
    const docBuffer = await doc.saveMemoryBuffer(0);
    saveBufferAsPDFDoc(docBuffer, 'bookmark.pdf');
  } catch (err) {
    console.log(err);
  }
};
313
314 // add your own license key as the second parameter, e.g. PDFNet.runWithCleanup(main, 'YOUR_LICENSE_KEY')
315 PDFNet.runWithCleanup(main);
316 };
317})(window);
318// eslint-disable-next-line spaced-comment
319//# sourceURL=LogicalStructureTest.js
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales