Logical Structure Reader - PDF Sample Code

Requirements
Sample JavaScript code that uses WebViewer to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; the ContentItems are the leaf nodes of the structure tree.
Learn more about our full PDF Data Extraction SDK Capabilities.
Implementation steps

Step 1: Follow the WebViewer get-started guide for your preferred web stack
Step 2: Enable the full API by passing the fullAPI option into the WebViewer constructor
Step 3: Add the sample code provided in this guide
This full sample is one of many included in the manual download of WebViewer.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6//---------------------------------------------------------------------------------------
7// This sample explores the structure and content of a tagged PDF document and dumps
8// the structure information to the console window.
9//
10// In tagged PDF documents StructTree acts as a central repository for information
11// related to a PDF document's logical structure. The tree consists of StructElement-s
12// and ContentItem-s which are leaf nodes of the structure tree.
13//
14// The sample can be extended to access and extract the marked-content elements such
15// as text and images.
16//---------------------------------------------------------------------------------------
17
18(exports => {
19  exports.runLogicalStructureTest = () => {
20    const PDFNet = exports.Core.PDFNet;
21
22    const PrintAndIndent = (printState, indent) => {
23      if (printState.str) {
24        const indentStr = ' '.repeat(printState.indent * 2);
25        console.log(indentStr + printState.str);
26      }
27      printState.str = '';
28      printState.indent = indent;
29    };
30
31    // Read the structure recursively
32    const ReadDocumentStructure = async (element, parent) => {
33      if (!(await element.isValid())) {
34        return;
35      }
36
37      const [type, numKids] = await Promise.all([element.getType(), element.getNumKids()]);
38
39      const elementData = {
40        type,
41        numKids,
42        isLeaf: false,
43        children: [],
44      };
45
46      if (await element.hasTitle()) {
47        elementData.title = await element.getTitle();
48      }
49
50      parent.children.push(elementData);
51
52      for (let i = 0; i < elementData.numKids; ++i) {
53        // Check is the kid is a leaf node (i.e. it is a ContentItem).
54        const contentItem = {
55          isLeaf: await element.isContentItem(i),
56        };
57        if (contentItem.isLeaf) {
58          const cont = await element.getAsContentItem(i);
59          const [type, page] = await Promise.all([cont.getType(), cont.getPage()]);
60          const pageNum = await page.getIndex();
61
62          contentItem.type = type;
63          contentItem.pageNum = pageNum;
64
65          switch (type) {
66            case PDFNet.ContentItem.Type.e_MCID:
67            case PDFNet.ContentItem.Type.e_MCR:
68              contentItem.mcid = await cont.getMCID();
69              break;
70            case PDFNet.ContentItem.Type.e_OBJR:
71              {
72                const refObj = await cont.getRefObj();
73                if (refObj) {
74                  contentItem.objNum = refObj.getObjNum();
75                }
76              }
77              break;
78            default:
79              break;
80          }
81          elementData.children.push(contentItem);
82        } else {
83          // the kid is another StructElement node.
84          await ReadDocumentStructure(await element.getAsStructElem(i), elementData);
85        }
86      }
87    };
88
89    // Read the elements sequentially with a reader
90    const ReadElements = async doc => {
91      const elements = [];
92      const reader = await PDFNet.ElementReader.create();
93      for (let itr = await doc.getPageIterator(); await itr.hasNext(); itr.next()) {
94        const page = await itr.current();
95        reader.beginOnPage(page);
96        const pageNum = await page.getIndex();
97        let element;
98        while ((element = await reader.next())) {
99          // Read page contents
100          const readElement = {
101            type: await element.getType(),
102            pageNum,
103          };
104          if (readElement.type === PDFNet.Element.Type.e_path || readElement.type === PDFNet.Element.Type.e_text || readElement.type === PDFNet.Element.Type.e_path) {
105            readElement.text = await element.getTextString();
106            // Check if the element is associated with any structural element.
107            // Content items are leaf nodes of the structure tree.
108            const structParent = await element.getParentStructElement();
109            readElement.isValid = await structParent.isValid();
110            if (readElement.isValid) {
111              readElement.structType = await structParent.getType();
112              readElement.mcid = await element.getStructMCID();
113              if (await structParent.hasTitle()) {
114                readElement.title = await structParent.getTitle();
115              }
116              readElement.objNum = await (await structParent.getSDFObj()).getObjNum();
117            }
118            elements.push(readElement);
119          }
120        }
121        reader.end();
122      }
123      return elements;
124    };
125
126    // Used in code snippet 1.
127    const ProcessStructElement = (element, indent, printState) => {
128      // Print out the type and title info, if any.
129      PrintAndIndent(printState, indent++);
130      printState.str += `Type: ${element.type}${element.title ? `. Title: ${element.title}` : ''}`;
131
132      for (let i = 0; i < element.numKids; ++i) {
133        const child = element.children[i];
134        // Check is the kid is a leaf node (i.e. it is a ContentItem).
135        if (child.isLeaf) {
136          PrintAndIndent(printState, indent);
137          printState.str += `Content Item. Part of page #${child.pageNum}`;
138
139          PrintAndIndent(printState, indent);
140          switch (child.type) {
141            case PDFNet.ContentItem.Type.e_MCID:
142            case PDFNet.ContentItem.Type.e_MCR:
143              printState.str += `MCID: ${child.mcid}`;
144              break;
145            case PDFNet.ContentItem.Type.e_OBJR:
146              printState.str += 'OBJR ';
147              if (child.objNum) {
148                printState.str += `- Referenced Object#: ${child.objNum}`;
149              }
150              break;
151            default:
152              break;
153          }
154        } else {
155          // the kid is another StructElement node.
156          ProcessStructElement(child, indent, printState);
157        }
158      }
159    };
160
161    // Used in code snippet 2.
162    const ProcessElementsArray = (elementsArray, printState) => {
163      for (let i = 0; i < elementsArray.length; i++) {
164        // Read page contents
165        const element = elementsArray[i];
166        // In this sample we process only paths & text, but the code can be
167        // extended to handle any element type.
168        if (element.type === PDFNet.Element.Type.e_path || element.type === PDFNet.Element.Type.e_text || element.type === PDFNet.Element.Type.e_path) {
169          switch (element.type) {
170            case PDFNet.Element.Type.e_path: // Process path ...
171              printState.str += '\nPATH: ';
172              break;
173            case PDFNet.Element.Type.e_text: // Process text ...
174              printState.str += `\nTEXT: ${element.text}\n`;
175              break;
176            case PDFNet.Element.Type.e_form: // Process form XObjects
177              printState.str += '\nFORM XObject: ';
178              // reader.formBegin();
179              // await ProcessElements(reader);
180              // reader.end();
181              break;
182          }
183
184          if (element.isValid) {
185            // Print out the parent structural element's type, title, and object number.
186            printState.str += ` Type: ${element.structType}, MCID: ${element.mcid}`;
187            if (element.title) {
188              printState.str += `. Title: ${element.title}`;
189            }
190            printState.str += `, Obj#: ${element.objNum}`;
191          }
192        }
193      }
194    };
195
196    // Used in code snippet 3.
197    const CreateMCIDDocMap = elementsArray => {
198      const mcidDocMap = {};
199      for (let i = 0; i < elementsArray.length; i++) {
200        const element = elementsArray[i];
201        if (!mcidDocMap[element.pageNum]) {
202          mcidDocMap[element.pageNum] = {};
203        }
204        const pageMcidMap = mcidDocMap[element.pageNum];
205        if (element.mcid >= 0 && element.type === PDFNet.Element.Type.e_text) {
206          if (element.mcid in pageMcidMap) {
207            pageMcidMap[element.mcid] += element.text;
208          } else {
209            pageMcidMap[element.mcid] = element.text;
210          }
211        }
212      }
213      return mcidDocMap;
214    };
215
216    // Used in code snippet 3.
217    const ProcessStructElement2 = (element, mcidDocMap, indent, printState) => {
218      // Print out the type and title info, if any.
219      PrintAndIndent(printState, indent);
220      printState.str += `<${element.type}${element.title ? ` title="${element.title}"` : ''}>`;
221
222      for (let i = 0; i < element.numKids; ++i) {
223        const child = element.children[i];
224        if (child.isLeaf) {
225          if (child.type === PDFNet.ContentItem.Type.e_MCID) {
226            const pageNum = child.pageNum;
227            const mcidPageMap = mcidDocMap[pageNum];
228            if (mcidPageMap) {
229              const mcid = child.mcid;
230              if (mcid in mcidPageMap) {
231                printState.str += mcidPageMap[mcid];
232              }
233            }
234          }
235        } else {
236          // the kid is another StructElement node.
237          ProcessStructElement2(child, mcidDocMap, indent + 1, printState);
238        }
239      }
240
241      PrintAndIndent(printState, indent);
242      printState.str += `</${element.type}>`;
243    };
244
245    const main = async () => {
246      // Relative path to the folder containing test files.
247      const inputPath = '../TestFiles/';
248      const printState = { str: '' };
249      try {
250        // Extract logical structure from a PDF document
251        const doc = await PDFNet.PDFDoc.createFromURL(`${inputPath}tagged.pdf`);
252        doc.initSecurityHandler();
253
254        const tree = await doc.getStructTree();
255        const hasValidTree = await tree.isValid();
256        const numKidsFromRoot = await tree.getNumKids();
257        const structRoot = {
258          children: [],
259        };
260        let elementsArray = [];
261
262        if (hasValidTree) {
263          console.log('Document has a StructTree root.');
264          const [, elementsArr] = await Promise.all([
265            new Promise(async res => {
266              for (let i = 0, numKids = numKidsFromRoot; i < numKids; ++i) {
267                // Recursively get structure info for all child elements.
268                await ReadDocumentStructure(await tree.getKid(i), structRoot);
269              }
270              res();
271            }),
272            ReadElements(doc),
273          ]);
274          elementsArray = elementsArr;
275        } else {
276          console.log('This document does not contain any logical structure.');
277        }
278
279        console.log('____________________________________________________________');
280        console.log('Sample 1 - Traverse logical structure tree...');
281        for (let i = 0; i < structRoot.children.length; ++i) {
282          // Recursively get structure info for all child elements.
283          ProcessStructElement(structRoot.children[i], 0, printState);
284        }
285        PrintAndIndent(printState, 0);
286        console.log('Done 1.');
287
288        console.log('____________________________________________________________');
289        console.log('Sample 2 - Get parent logical structure elements from');
290        console.log('layout elements.');
291        ProcessElementsArray(elementsArray, printState);
292        PrintAndIndent(printState, 0);
293        console.log('Done 2.');
294
295        console.log('____________________________________________________________');
296        console.log("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
297        {
298          const mcidDocMap = CreateMCIDDocMap(elementsArray);
299          if (hasValidTree) {
300            for (let i = 0, numKids = numKidsFromRoot; i < numKids; ++i) {
301              ProcessStructElement2(structRoot.children[i], mcidDocMap, 0, printState);
302            }
303          }
304        }
305        PrintAndIndent(printState, 0);
306        console.log('Done 3.');
307        const docBuffer = await doc.saveMemoryBuffer(0);
308        saveBufferAsPDFDoc(docBuffer, 'bookmark.pdf');
309      } catch (err) {
310        console.log(err);
311      }
312    };
313
314    // add your own license key as the second parameter, e.g. PDFNet.runWithCleanup(main, 'YOUR_LICENSE_KEY')
315    PDFNet.runWithCleanup(main);
316  };
317})(window);
318// eslint-disable-next-line spaced-comment
319//# sourceURL=LogicalStructureTest.js
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales
Product:

Logical Structure Reader - PDF Sample Code

Implementation steps