LogicalStructure

Sample code for using the Apryse Server SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s; the ContentItem-s are the leaf nodes of the structure tree. Sample code is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our Server SDK and PDF Data Extraction SDK Capabilities.

#---------------------------------------------------------------------------------------
# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
# Consult LICENSE.txt regarding license information.
#---------------------------------------------------------------------------------------

import site
# Register the SDK library folder as a site directory before importing the
# bindings (presumably this is where PDFNetPython lives -- verify locally).
site.addsitedir("../../../PDFNetC/Lib")
import sys
from PDFNetPython import *

# Extend the module search path so the LicenseKey module can be imported.
sys.path.append("../../LicenseKey/PYTHON")
from LicenseKey import *

#---------------------------------------------------------------------------------------
# This sample explores the structure and content of a tagged PDF document and dumps
# the structure information to the console window.
#
# In tagged PDF documents StructTree acts as a central repository for information
# related to a PDF document's logical structure. The tree consists of StructElement-s
# and ContentItem-s which are leaf nodes of the structure tree.
#
# The sample can be extended to access and extract the marked-content elements such
# as text and images.
#---------------------------------------------------------------------------------------

def PrintIndent(indent):
    """Start a new console line and indent it.

    Writes a newline to standard output followed by ``indent`` single-space
    indentation units. No trailing newline is written, so the caller's next
    write continues on the freshly indented line.

    Args:
        indent: Number of indentation units to emit. Zero or a negative
            value produces a bare newline.
    """
    # String repetition yields "" for indent <= 0, matching a counter loop
    # that never runs.
    sys.stdout.write("\n" + " " * indent)

def ProcessStructElement(element, indent):
    """Recursively print a structure element and everything beneath it.

    For a valid element, writes its type (and title, if it has one) at the
    given indentation, then walks its kids: content items are described in
    place, while nested structure elements are handled by a recursive call.
    Invalid elements are ignored.

    Args:
        element: Structure element to dump; must provide IsValid(),
            GetType(), HasTitle(), GetTitle(), GetNumKids(),
            IsContentItem(), GetAsContentItem() and GetAsStructElem().
        indent: Indentation level for this element's own line. Its kids are
            printed one level deeper.
    """
    if not element.IsValid():
        return

    # Print out the type and title info, if any.
    PrintIndent(indent)
    sys.stdout.write("Type: " + element.GetType())
    if element.HasTitle():
        sys.stdout.write(". Title:" + element.GetTitle())

    kid_indent = indent + 1
    for i in range(element.GetNumKids()):
        # A kid that is not a content item is another structure element.
        if not element.IsContentItem(i):
            ProcessStructElement(element.GetAsStructElem(i), kid_indent)
            continue

        # The kid is a leaf node (i.e. it is a ContentItem).
        cont = element.GetAsContentItem(i)
        item_type = cont.GetType()
        page = cont.GetPage()

        PrintIndent(kid_indent)
        sys.stdout.write("Content Item. Part of page #" + str(page.GetIndex()))
        PrintIndent(kid_indent)
        if item_type in (ContentItem.e_MCID, ContentItem.e_MCR):
            # Both kinds are reported through their MCID.
            sys.stdout.write("MCID: " + str(cont.GetMCID()))
        elif item_type == ContentItem.e_OBJR:
            sys.stdout.write("OBJR ")
            ref_obj = cont.GetRefObj()
            if ref_obj is not None:
                sys.stdout.write("- Referenced Object#: " + str(ref_obj.GetObjNum()))


# Used in code snippet 3.
def ProcessElements2(reader, mcid_page_map):
    """Collect the text of one page, grouped by structure MCID.

    Reads elements from ``reader`` until it returns None. For every text
    element whose structure MCID is non-negative, the element's text is
    appended to the entry for that MCID in ``mcid_page_map`` (the entry is
    created on first use). All other elements are skipped.

    Args:
        reader: Element reader already positioned on a page; only its
            Next() method is used here.
        mcid_page_map: Dictionary mapping MCID -> accumulated text. It is
            updated in place.
    """
    element = reader.Next()
    while element is not None:  # Read page contents
        # In this sample we process only text, but the code can be extended
        # to handle paths, images, or other Element type.
        mcid = element.GetStructMCID()

        if mcid >= 0 and element.GetType() == Element.e_text:
            val = element.GetTextString()

            if mcid in mcid_page_map:
                mcid_page_map[mcid] = str(mcid_page_map[mcid]) + val
            else:
                mcid_page_map[mcid] = val
        element = reader.Next()

# Used in code snippet 2.
def ProcessElements(reader):
    """Print each path, text and form element with its parent structure info.

    Reads elements from ``reader`` until it returns None. Path, text and
    form XObject elements get a heading line; if such an element has a valid
    parent structure element, that parent's type, the element's MCID, the
    parent's title (when present) and the parent's object number are
    appended. Elements of any other type are skipped.

    Args:
        reader: Element reader already positioned on a page; only its
            Next() method is used here.
    """
    element = reader.Next()
    while element is not None:  # Read page contents
        # In this sample we process only paths, text & form XObjects, but the
        # code can be extended to handle any element type.
        elem_type = element.GetType()
        if elem_type in (Element.e_path, Element.e_text, Element.e_form):
            if elem_type == Element.e_path:  # Process path ...
                sys.stdout.write("\nPATH: ")
            elif elem_type == Element.e_text:  # Process text ...
                sys.stdout.write("\nTEXT: " + element.GetTextString() + "\n")
            else:  # Element.e_form: process form XObjects
                # The original tested e_path a second time here, which made
                # this branch unreachable.
                sys.stdout.write("\nFORM XObject: ")

            # Check if the element is associated with any structural element.
            # Content items are leaf nodes of the structure tree.
            struct_parent = element.GetParentStructElement()
            if struct_parent.IsValid():
                # Print out the parent structural element's type, title, and object number.
                sys.stdout.write(" Type: " + str(struct_parent.GetType())
                                 + ", MCID: " + str(element.GetStructMCID()))
                if struct_parent.HasTitle():
                    sys.stdout.write(". Title: " + struct_parent.GetTitle())
                sys.stdout.write(", Obj#: " + str(struct_parent.GetSDFObj().GetObjNum()))
        element = reader.Next()


def ProcessStructElement2(element, mcid_doc_map, indent):
    """Recursively print a structure element as an XML-style tag with its text.

    For a valid element, writes an opening tag named after the element's
    type (with a ``title`` attribute when the element has a title), then the
    text recorded for each of its MCID content items, then its nested
    structure elements one level deeper, and finally the closing tag.
    Invalid elements are ignored.

    Args:
        element: Structure element to dump.
        mcid_doc_map: Dictionary mapping page index -> {MCID -> text}.
            Content items whose page or MCID is missing from the map
            produce no output.
        indent: Indentation level for this element's opening and closing
            tags.
    """
    if not element.IsValid():
        return

    # Print out the type and title info, if any
    PrintIndent(indent)
    sys.stdout.write("<" + element.GetType())
    if element.HasTitle():
        sys.stdout.write(" title=\"" + element.GetTitle() + "\"")
    sys.stdout.write(">")

    for i in range(element.GetNumKids()):
        if not element.IsContentItem(i):
            # The kid is another StructElement node.
            ProcessStructElement2(element.GetAsStructElem(i), mcid_doc_map, indent + 1)
            continue

        cont = element.GetAsContentItem(i)
        # Only MCID items carry text gathered from the page content.
        if cont.GetType() != ContentItem.e_MCID:
            continue

        page_num = cont.GetPage().GetIndex()
        if page_num not in mcid_doc_map:
            continue

        mcid_page_map = mcid_doc_map[page_num]
        mcid_key = cont.GetMCID()
        if mcid_key in mcid_page_map:
            sys.stdout.write(mcid_page_map[mcid_key])

    PrintIndent(indent)
    sys.stdout.write("</" + element.GetType() + ">")


def main():
    """Run the three logical-structure demonstrations on ``tagged.pdf``.

    1. Traverse the structure tree and print every element and content item.
    2. Walk each page's content and print the parent structure element of
       every path, text and form element.
    3. Print an XML-style view of the structure tree with the page text
       placed inside the elements it belongs to.

    The document is then saved to the output folder with the
    ``SDFDoc.e_linearized`` flag. The document is closed and PDFNet is
    terminated even if one of the steps raises.
    """
    PDFNet.Initialize(LicenseKey)
    try:
        # Relative path to the folder containing the test files.
        input_path = "../../TestFiles/"
        output_path = "../../TestFiles/Output/"

        # Extract logical structure from a PDF document
        doc = PDFDoc(input_path + "tagged.pdf")
        try:
            doc.InitSecurityHandler()

            print("____________________________________________________________")
            print("Sample 1 - Traverse logical structure tree...")

            tree = doc.GetStructTree()
            if tree.IsValid():
                print("Document has a StructTree root.")

                for i in range(tree.GetNumKids()):
                    # Recursively get structure info for all child elements.
                    ProcessStructElement(tree.GetKid(i), 0)
            else:
                print("This document does not contain any logical structure.")

            print("\nDone 1.")

            print("____________________________________________________________")
            print("Sample 2 - Get parent logical structure elements from")
            print("layout elements.")

            reader = ElementReader()
            itr = doc.GetPageIterator()
            while itr.HasNext():
                reader.Begin(itr.Current())
                ProcessElements(reader)
                reader.End()
                itr.Next()

            print("\nDone 2.")

            print("____________________________________________________________")
            print("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
            # A map which maps page numbers(as Integers)
            # to page Maps(which map from struct mcid(as Integers) to
            # text Strings)
            mcid_doc_map = dict()
            reader = ElementReader()
            itr = doc.GetPageIterator()
            while itr.HasNext():
                reader.Begin(itr.Current())
                page_mcid_map = dict()
                mcid_doc_map[itr.Current().GetIndex()] = page_mcid_map
                ProcessElements2(reader, page_mcid_map)
                reader.End()
                itr.Next()

            tree = doc.GetStructTree()
            if tree.IsValid():
                for i in range(tree.GetNumKids()):
                    ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0)
            print("\nDone 3.")
            doc.Save((output_path + "LogicalStructure.pdf"), SDFDoc.e_linearized)
        finally:
            # Release the document even when a step above fails.
            doc.Close()
    finally:
        # Always shut the library down once it has been initialized.
        PDFNet.Terminate()

# Run the sample only when this file is executed as a script.
if __name__ == '__main__':
    main()

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales