Logical structure reader

Sample JavaScript code that uses the Apryse SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s; the ContentItem-s are the leaf nodes of the structure tree.
Learn more about our Web SDK and PDF Data Extraction SDK Capabilities.
This sample works with Full-API for WebViewer.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6//---------------------------------------------------------------------------------------
7// This sample explores the structure and content of a tagged PDF document and dumps
8// the structure information to the console window.
9//
10// In tagged PDF documents StructTree acts as a central repository for information
11// related to a PDF document's logical structure. The tree consists of StructElement-s
12// and ContentItem-s which are leaf nodes of the structure tree.
13//
14// The sample can be extended to access and extract the marked-content elements such
15// as text and images.
16//---------------------------------------------------------------------------------------
17
18(exports => {
19
20
21
22
23
24
25
26
27
28
29  exports.runLogicalStructureTest = () => {
30    const PDFNet = exports.Core.PDFNet;
31
/**
 * Flushes the text buffered in `printState` to the console and starts a new,
 * empty buffered line.
 *
 * The buffered text is printed using the indentation level stored by the
 * PREVIOUS call (two spaces per level). The `indent` argument is only stored;
 * it applies to whatever is appended to `printState.str` after this call.
 * Nothing is printed when the buffer is empty.
 *
 * @param {{str: string, indent?: number}} printState - Mutable print buffer.
 *   `str` is reset to '' and `indent` is overwritten.
 * @param {number} indent - Indentation level to store for the next line.
 */
const PrintAndIndent = (printState, indent) => {
  if (printState.str) {
    // String.prototype.repeat throws a RangeError for negative or infinite
    // counts, so anything unusable (including an unset level) means "no indent".
    const width = printState.indent * 2;
    const padding = Number.isFinite(width) && width > 0 ? ' '.repeat(width) : '';
    console.log(padding + printState.str);
  }
  printState.str = '';
  printState.indent = indent;
};
40
41    // Read the structure recursively
/**
 * Recursively snapshots a structure element and everything below it into
 * plain objects, appending the snapshot to `parent.children`.
 *
 * All values are read from the element here, so the resulting tree can be
 * traversed afterwards without any further awaits.
 *
 * Structure-element node: `{ type, numKids, isLeaf: false, children, title? }`
 * Content-item leaf:      `{ isLeaf, type, pageNum, mcid? , objNum? }`
 *
 * An element whose `isValid()` resolves to false is skipped entirely, so a
 * node's `children` array can be shorter than its `numKids`.
 *
 * @param {object} element - Structure element exposing the async accessors
 *   used below (isValid, getType, getNumKids, hasTitle, ...).
 * @param {{children: Array<object>}} parent - Node that receives the snapshot.
 * @returns {Promise<void>}
 */
const ReadDocumentStructure = async (element, parent) => {
  if (!(await element.isValid())) {
    return;
  }

  const [type, numKids] = await Promise.all([element.getType(), element.getNumKids()]);

  const elementData = {
    type,
    numKids,
    isLeaf: false,
    children: [],
  };

  if (await element.hasTitle()) {
    elementData.title = await element.getTitle();
  }

  parent.children.push(elementData);

  for (let i = 0; i < numKids; ++i) {
    // Check if the kid is a leaf node (i.e. it is a ContentItem).
    const contentItem = {
      isLeaf: await element.isContentItem(i),
    };
    if (!contentItem.isLeaf) {
      // The kid is another StructElement node.
      await ReadDocumentStructure(await element.getAsStructElem(i), elementData);
      continue;
    }

    const cont = await element.getAsContentItem(i);
    const [itemType, page] = await Promise.all([cont.getType(), cont.getPage()]);
    contentItem.type = itemType;
    contentItem.pageNum = await page.getIndex();

    switch (itemType) {
      case PDFNet.ContentItem.Type.e_MCID:
      case PDFNet.ContentItem.Type.e_MCR:
        contentItem.mcid = await cont.getMCID();
        break;
      case PDFNet.ContentItem.Type.e_OBJR: {
        const refObj = await cont.getRefObj();
        if (refObj) {
          // getObjNum() is asynchronous; without the await the snapshot would
          // store a Promise and print as "[object Promise]".
          contentItem.objNum = await refObj.getObjNum();
        }
        break;
      }
      default:
        break;
    }
    elementData.children.push(contentItem);
  }
};
98
99    // Read the elements sequentially with a reader
/**
 * Walks every page of `doc` with an ElementReader and returns plain-object
 * snapshots of the path and text elements, in reading order.
 *
 * Each snapshot has `type`, `pageNum` and `isValid` (whether the element has
 * a valid parent structure element). Text elements also carry `text`. When
 * `isValid` is true the snapshot additionally has `structType`, `mcid`,
 * `objNum` and, if the parent has one, `title`.
 *
 * Other element types are ignored; the code can be extended to handle them.
 *
 * @param {object} doc - Document exposing an async `getPageIterator()`.
 * @returns {Promise<Array<object>>} Snapshots of the path/text elements.
 */
const ReadElements = async doc => {
  const { e_path: pathType, e_text: textType } = PDFNet.Element.Type;
  const elements = [];
  const reader = await PDFNet.ElementReader.create();
  // Every reader/iterator call is awaited so that no promise is left floating
  // and a failure surfaces at the call that caused it.
  for (let itr = await doc.getPageIterator(); await itr.hasNext(); await itr.next()) {
    const page = await itr.current();
    await reader.beginOnPage(page);
    try {
      const pageNum = await page.getIndex();
      let element;
      while ((element = await reader.next())) {
        const type = await element.getType();
        if (type !== pathType && type !== textType) {
          continue;
        }
        const readElement = { type, pageNum };
        if (type === textType) {
          // Only text elements have a text string to read.
          readElement.text = await element.getTextString();
        }
        // Check if the element is associated with any structural element.
        // Content items are leaf nodes of the structure tree.
        const structParent = await element.getParentStructElement();
        readElement.isValid = await structParent.isValid();
        if (readElement.isValid) {
          readElement.structType = await structParent.getType();
          readElement.mcid = await element.getStructMCID();
          if (await structParent.hasTitle()) {
            readElement.title = await structParent.getTitle();
          }
          readElement.objNum = await (await structParent.getSDFObj()).getObjNum();
        }
        elements.push(readElement);
      }
    } finally {
      // Close the page even if reading one of its elements failed.
      await reader.end();
    }
  }
  return elements;
};
135
136    // Used in code snippet 1.
/**
 * Prints a structure-tree snapshot as an indented outline (sample 1).
 *
 * For each structure element it emits a "Type: ..." line (with the title, if
 * any); for each content-item leaf it emits the page it belongs to followed
 * by its MCID or referenced object number. Lines are buffered in `printState`
 * and flushed through `PrintAndIndent`; the last line stays in the buffer for
 * the caller to flush.
 *
 * @param {{type: *, title?: string, children: Array<object>}} element -
 *   Snapshot node of a structure element.
 * @param {number} indent - Indentation level of this element; kids use `indent + 1`.
 * @param {{str: string, indent?: number}} printState - Shared print buffer.
 */
const ProcessStructElement = (element, indent, printState) => {
  // Print out the type and title info, if any.
  PrintAndIndent(printState, indent);
  printState.str += `Type: ${element.type}${element.title ? `. Title: ${element.title}` : ''}`;

  const kidIndent = indent + 1;
  // Iterate over the children that were actually captured: `numKids` can be
  // larger than `children.length`, and indexing past the end would throw.
  for (const child of element.children) {
    if (!child.isLeaf) {
      // The kid is another StructElement node.
      ProcessStructElement(child, kidIndent, printState);
      continue;
    }

    // The kid is a leaf node (i.e. it is a ContentItem).
    PrintAndIndent(printState, kidIndent);
    printState.str += `Content Item. Part of page #${child.pageNum}`;

    PrintAndIndent(printState, kidIndent);
    switch (child.type) {
      case PDFNet.ContentItem.Type.e_MCID:
      case PDFNet.ContentItem.Type.e_MCR:
        printState.str += `MCID: ${child.mcid}`;
        break;
      case PDFNet.ContentItem.Type.e_OBJR:
        printState.str += 'OBJR ';
        if (child.objNum) {
          printState.str += `- Referenced Object#: ${child.objNum}`;
        }
        break;
      default:
        break;
    }
  }
};
170
171    // Used in code snippet 2.
/**
 * Appends a textual description of each page element to `printState.str`
 * (sample 2): the element kind, then - when the element has a valid parent
 * structure element - that parent's type, MCID, title and object number.
 *
 * In this sample only paths, text and form XObjects are described, but the
 * code can be extended to handle any element type. Nothing is flushed here;
 * the caller prints the accumulated string.
 *
 * @param {Array<object>} elementsArray - Element snapshots (`type`, `text`,
 *   `isValid`, `structType`, `mcid`, `title`, `objNum`).
 * @param {{str: string}} printState - Print buffer that is appended to.
 */
const ProcessElementsArray = (elementsArray, printState) => {
  const { e_path: pathType, e_text: textType, e_form: formType } = PDFNet.Element.Type;

  for (const element of elementsArray) {
    switch (element.type) {
      case pathType: // Process path ...
        printState.str += '\nPATH: ';
        break;
      case textType: // Process text ...
        printState.str += `\nTEXT: ${element.text}\n`;
        break;
      case formType: // Process form XObjects
        // The contents of the form are not descended into here.
        printState.str += '\nFORM XObject: ';
        break;
      default:
        // Any other element type is ignored.
        continue;
    }

    if (element.isValid) {
      // Print out the parent structural element's type, title, and object number.
      printState.str += ` Type: ${element.structType}, MCID: ${element.mcid}`;
      if (element.title) {
        printState.str += `. Title: ${element.title}`;
      }
      printState.str += `, Obj#: ${element.objNum}`;
    }
  }
};
205
206    // Used in code snippet 3.
/**
 * Builds a two-level lookup `pageNum -> mcid -> text` from element snapshots
 * (sample 3). The text of all text elements that share a page and an MCID is
 * concatenated in array order.
 *
 * Every page that appears in `elementsArray` gets an entry, even when none of
 * its elements contribute text.
 *
 * @param {Array<{pageNum: number, type: *, mcid?: number, text?: string}>} elementsArray
 * @returns {Object<number, Object<number, string>>} Text per page and MCID.
 */
const CreateMCIDDocMap = elementsArray => {
  const textType = PDFNet.Element.Type.e_text;
  const mcidDocMap = {};

  for (const element of elementsArray) {
    const { pageNum, mcid } = element;
    if (!mcidDocMap[pageNum]) {
      mcidDocMap[pageNum] = {};
    }
    // `null >= 0` is true in JavaScript, so require a real integer MCID
    // instead of relying on the comparison alone.
    if (element.type !== textType || !Number.isInteger(mcid) || mcid < 0) {
      continue;
    }
    const pageMcidMap = mcidDocMap[pageNum];
    // A missing text contributes nothing rather than the string "undefined".
    const text = element.text == null ? '' : element.text;
    pageMcidMap[mcid] = mcid in pageMcidMap ? pageMcidMap[mcid] + text : text;
  }
  return mcidDocMap;
};
225
226    // Used in code snippet 3.
/**
 * Prints a structure-tree snapshot in an XML-like form (sample 3): an opening
 * tag with the element type (and title, if any), the text of its marked
 * content looked up in `mcidDocMap`, its nested structure elements, and a
 * closing tag. Lines are buffered in `printState` and flushed through
 * `PrintAndIndent`; the final closing tag stays in the buffer for the caller.
 *
 * Only leaves of type `e_MCID` contribute text; other leaves are skipped.
 *
 * @param {{type: *, title?: string, children: Array<object>}} element -
 *   Snapshot node of a structure element.
 * @param {Object<number, Object<number, string>>} mcidDocMap - Text per page
 *   number and MCID.
 * @param {number} indent - Indentation level of this element's tags.
 * @param {{str: string, indent?: number}} printState - Shared print buffer.
 */
const ProcessStructElement2 = (element, mcidDocMap, indent, printState) => {
  // Print out the type and title info, if any.
  PrintAndIndent(printState, indent);
  printState.str += `<${element.type}${element.title ? ` title="${element.title}"` : ''}>`;

  // Iterate over the children that were actually captured: `numKids` can be
  // larger than `children.length`, and indexing past the end would throw.
  for (const child of element.children) {
    if (!child.isLeaf) {
      // The kid is another StructElement node.
      ProcessStructElement2(child, mcidDocMap, indent + 1, printState);
      continue;
    }
    if (child.type !== PDFNet.ContentItem.Type.e_MCID) {
      continue;
    }
    const mcidPageMap = mcidDocMap[child.pageNum];
    if (mcidPageMap && child.mcid in mcidPageMap) {
      printState.str += mcidPageMap[child.mcid];
    }
  }

  PrintAndIndent(printState, indent);
  printState.str += `</${element.type}>`;
};
254
/**
 * Runs the three logical-structure samples against `tagged.pdf` and writes
 * the results to the console:
 *   1. an indented outline of the structure tree,
 *   2. the parent structure element of each path/text page element,
 *   3. an XML-style dump of the structure tree with its marked-content text.
 *
 * Any error raised along the way is caught and logged; the returned promise
 * always resolves.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  // Relative path to the folder containing test files.
  const inputPath = '../TestFiles/';
  const printState = { str: '' };
  try {
    // Extract logical structure from a PDF document
    const doc = await PDFNet.PDFDoc.createFromURL(`${inputPath}tagged.pdf`);
    await doc.initSecurityHandler();

    const tree = await doc.getStructTree();
    const hasValidTree = await tree.isValid();
    const structRoot = {
      children: [],
    };
    let elementsArray = [];

    if (hasValidTree) {
      console.log('Document has a StructTree root.');
      const numKidsFromRoot = await tree.getNumKids();
      // A plain async function (rather than `new Promise(async ...)`) so that
      // a failure while reading the tree rejects and reaches the catch below.
      const readStructure = async () => {
        for (let i = 0; i < numKidsFromRoot; ++i) {
          // Recursively get structure info for all child elements.
          await ReadDocumentStructure(await tree.getKid(i), structRoot);
        }
      };
      [, elementsArray] = await Promise.all([readStructure(), ReadElements(doc)]);
    } else {
      console.log('This document does not contain any logical structure.');
    }

    console.log('____________________________________________________________');
    console.log('Sample 1 - Traverse logical structure tree...');
    for (const rootKid of structRoot.children) {
      ProcessStructElement(rootKid, 0, printState);
    }
    PrintAndIndent(printState, 0);
    console.log('Done 1.');

    console.log('____________________________________________________________');
    console.log('Sample 2 - Get parent logical structure elements from');
    console.log('layout elements.');
    ProcessElementsArray(elementsArray, printState);
    PrintAndIndent(printState, 0);
    console.log('Done 2.');

    console.log('____________________________________________________________');
    console.log("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
    {
      const mcidDocMap = CreateMCIDDocMap(elementsArray);
      // Walk the kids that were actually captured; the tree's reported kid
      // count can exceed them. The list is empty when there is no valid tree.
      for (const rootKid of structRoot.children) {
        ProcessStructElement2(rootKid, mcidDocMap, 0, printState);
      }
    }
    PrintAndIndent(printState, 0);
    console.log('Done 3.');
    const docBuffer = await doc.saveMemoryBuffer(0);
    // NOTE(review): saveBufferAsPDFDoc is not defined in the visible part of
    // this file - presumably a global helper supplied by the sample page;
    // confirm. The 'bookmark.pdf' name also looks carried over from another
    // sample - confirm before renaming.
    saveBufferAsPDFDoc(docBuffer, 'bookmark.pdf');
  } catch (err) {
    console.log(err);
  }
};
323
324    // add your own license key as the second parameter, e.g. PDFNet.runWithCleanup(main, 'YOUR_LICENSE_KEY')
325    PDFNet.runWithCleanup(main);
326  };
327})(window);
328// eslint-disable-next-line spaced-comment
329//# sourceURL=LogicalStructureTest.js
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales
Product:

Logical structure reader