PDF Logical Structure Reader - Python Sample Code

Sample code for using Apryse Server SDK to explore the logical structure and content of a tagged PDF file and then dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and of ContentItem-s, which are the leaf nodes of the structure tree. Sample code is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
14#---------------------------------------------------------------------------------------
15# This sample explores the structure and content of a tagged PDF document and dumps
16# the structure information to the console window.
17#
18# In tagged PDF documents StructTree acts as a central repository for information
19# related to a PDF document's logical structure. The tree consists of StructElement-s
20# and ContentItem-s which are leaf nodes of the structure tree.
21#
22# The sample can be extended to access and extract the marked-content elements such
23# as text and images.
24#---------------------------------------------------------------------------------------
25
def PrintIndent(indent):
    """Start a new console line and pad it for the given nesting depth.

    Writes a newline to stdout followed by one space per indentation level.

    Args:
        indent: Number of indentation levels. Zero or a negative value
            produces no padding.
    """
    sys.stdout.write("\n")
    # String repetition replaces the manual counting loop; a non-positive
    # count yields an empty string, just as the loop body would never run.
    sys.stdout.write(" " * indent)
32
def ProcessStructElement(element, indent):
    """Recursively dump a structure element and its kids to stdout.

    Prints the element's type (and title, when it has one) at the given
    indentation, then visits every kid one level deeper. Kids that are
    content items are printed with their page index and either their MCID
    or, for object references, the referenced object number. Kids that are
    structure elements are processed recursively.

    Args:
        element: Structure element to print; invalid elements are ignored.
        indent: Indentation level used for this element's own line.
    """
    if not element.IsValid():
        return

    # Print out the type and title info, if any.
    PrintIndent(indent)
    sys.stdout.write("Type: " + element.GetType())
    if element.HasTitle():
        sys.stdout.write(". Title:" + element.GetTitle())

    # Everything below this element is printed one level deeper.
    kid_indent = indent + 1
    for i in range(element.GetNumKids()):
        if not element.IsContentItem(i):
            # The kid is another StructElement node.
            ProcessStructElement(element.GetAsStructElem(i), kid_indent)
            continue

        # The kid is a leaf node (i.e. it is a ContentItem).
        cont = element.GetAsContentItem(i)
        item_type = cont.GetType()
        page = cont.GetPage()

        PrintIndent(kid_indent)
        sys.stdout.write("Content Item. Part of page #" + str(page.GetIndex()))
        PrintIndent(kid_indent)
        if item_type in (ContentItem.e_MCID, ContentItem.e_MCR):
            # Both marked-content kinds are reported by their MCID.
            sys.stdout.write("MCID: " + str(cont.GetMCID()))
        elif item_type == ContentItem.e_OBJR:
            sys.stdout.write("OBJR ")
            ref_obj = cont.GetRefObj()
            if ref_obj is not None:
                sys.stdout.write("- Referenced Object#: " + str(ref_obj.GetObjNum()))
69
70
71# Used in code snippet 3.
def ProcessElements2(reader, mcid_page_map):
    """Collect the text of one page's marked content, keyed by MCID.

    Reads elements from ``reader`` until it returns ``None``. For each text
    element with a non-negative struct MCID, its text string is appended to
    the entry for that MCID in ``mcid_page_map``. Used by Sample 3 in
    ``main``.

    Args:
        reader: Element reader already positioned on a page.
        mcid_page_map: Dict mapping MCID to accumulated text; updated in
            place.
    """
    element = reader.Next()
    while element is not None:  # Read page contents
        # In this sample we process only text, but the code can be extended
        # to handle paths, images, or other Element type.
        mcid = element.GetStructMCID()

        if mcid >= 0 and element.GetType() == Element.e_text:
            val = element.GetTextString()

            # Several text runs may share one MCID; concatenate them in
            # reading order.
            if mcid in mcid_page_map:
                mcid_page_map[mcid] = str(mcid_page_map[mcid]) + val
            else:
                mcid_page_map[mcid] = val
        element = reader.Next()
87
88# Used in code snippet 2.
def ProcessElements(reader):
    """Print each path, text and form element with its parent structure info.

    Reads elements from ``reader`` until it returns ``None``. For every
    path, text or form XObject element a label is written to stdout; if the
    element has a valid parent structure element, that parent's type, the
    element's MCID, the parent's title (when present) and the parent's
    object number are appended. Used by Sample 2 in ``main``.

    Args:
        reader: Element reader already positioned on a page.
    """
    element = reader.Next()
    while element is not None:  # Read page contents
        # In this sample we process only paths, text and form XObjects, but
        # the code can be extended to handle any element type.
        elem_type = element.GetType()
        if elem_type in (Element.e_path, Element.e_text, Element.e_form):
            if elem_type == Element.e_path:  # Process path ...
                sys.stdout.write("\nPATH: ")
            elif elem_type == Element.e_text:  # Process text ...
                sys.stdout.write("\nTEXT: " + element.GetTextString() + "\n")
            else:  # Process form XObjects
                # The original compared against e_path a second time here,
                # which made this branch unreachable.
                sys.stdout.write("\nFORM XObject: ")

            # Check if the element is associated with any structural element.
            # Content items are leaf nodes of the structure tree.
            struct_parent = element.GetParentStructElement()
            if struct_parent.IsValid():
                # Print out the parent structural element's type, title, and object number.
                sys.stdout.write(" Type: " + str(struct_parent.GetType())
                                 + ", MCID: " + str(element.GetStructMCID()))
                if struct_parent.HasTitle():
                    sys.stdout.write(". Title: " + struct_parent.GetTitle())
                sys.stdout.write(", Obj#: " + str(struct_parent.GetSDFObj().GetObjNum()))
        element = reader.Next()
116
117
def ProcessStructElement2(element, mcid_doc_map, indent):
    """Recursively print a structure element as an XML-style tag.

    Writes an opening tag named after the element's type (with a ``title``
    attribute when the element has a title), then visits each kid. MCID
    content items are replaced by the text recorded for that page and MCID
    in ``mcid_doc_map``, if any; structure-element kids are printed
    recursively one level deeper. Finally writes the matching closing tag.

    Args:
        element: Structure element to print; invalid elements are ignored.
        mcid_doc_map: Dict mapping page index to a dict of MCID -> text.
        indent: Indentation level for this element's opening and closing tags.
    """
    if not element.IsValid():
        return

    tag = element.GetType()

    # Print out the type and title info, if any
    PrintIndent(indent)
    sys.stdout.write("<" + tag)
    if element.HasTitle():
        sys.stdout.write(" title=\"" + element.GetTitle() + "\"")
    sys.stdout.write(">")

    for i in range(element.GetNumKids()):
        if element.IsContentItem(i):
            cont = element.GetAsContentItem(i)
            if cont.GetType() == ContentItem.e_MCID:
                page_num = cont.GetPage().GetIndex()
                if page_num in mcid_doc_map:
                    mcid_page_map = mcid_doc_map[page_num]
                    mcid_key = cont.GetMCID()
                    if mcid_key in mcid_page_map:
                        sys.stdout.write(mcid_page_map[mcid_key])
        else:  # the kid is another StructElement node.
            ProcessStructElement2(element.GetAsStructElem(i), mcid_doc_map, indent + 1)

    PrintIndent(indent)
    sys.stdout.write("</" + tag + ">")
146
147
def main():
    """Run the three logical-structure samples against ``tagged.pdf``.

    Initializes PDFNet with ``LicenseKey``, opens
    ``../../TestFiles/tagged.pdf`` and then:

    1. traverses the structure tree and prints it (``ProcessStructElement``);
    2. walks the page contents and prints each element's parent structure
       element (``ProcessElements``);
    3. builds a page -> MCID -> text map and prints the structure tree in an
       XML-like form (``ProcessElements2`` / ``ProcessStructElement2``).

    The document is then saved, linearized, to
    ``../../TestFiles/Output/LogicalStructure.pdf``. The document is closed
    and PDFNet is terminated even if one of the steps raises.
    """
    PDFNet.Initialize(LicenseKey)
    try:
        # Relative path to the folder containing the test files.
        input_path = "../../TestFiles/"
        output_path = "../../TestFiles/Output/"

        # Extract logical structure from a PDF document
        doc = PDFDoc(input_path + "tagged.pdf")
        try:
            doc.InitSecurityHandler()

            print("____________________________________________________________")
            print("Sample 1 - Traverse logical structure tree...")

            tree = doc.GetStructTree()
            if tree.IsValid():
                print("Document has a StructTree root.")

                for i in range(tree.GetNumKids()):
                    # Recursively get structure info for all child elements.
                    ProcessStructElement(tree.GetKid(i), 0)
            else:
                print("This document does not contain any logical structure.")

            print("\nDone 1.")

            print("____________________________________________________________")
            print("Sample 2 - Get parent logical structure elements from")
            print("layout elements.")

            reader = ElementReader()
            itr = doc.GetPageIterator()
            while itr.HasNext():
                reader.Begin(itr.Current())
                ProcessElements(reader)
                reader.End()
                itr.Next()

            print("\nDone 2.")

            print("____________________________________________________________")
            print("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
            # A map which maps page numbers (as integers) to page maps
            # (which map from struct MCID (as integers) to text strings).
            mcid_doc_map = dict()
            reader = ElementReader()
            itr = doc.GetPageIterator()
            while itr.HasNext():
                page = itr.Current()
                reader.Begin(page)
                page_mcid_map = dict()
                mcid_doc_map[page.GetIndex()] = page_mcid_map
                ProcessElements2(reader, page_mcid_map)
                reader.End()
                itr.Next()

            if tree.IsValid():
                for i in range(tree.GetNumKids()):
                    ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0)
            print("\nDone 3.")

            doc.Save((output_path + "LogicalStructure.pdf"), SDFDoc.e_linearized)
        finally:
            # Release the document even when a sample step fails.
            doc.Close()
    finally:
        # Always pair Initialize with Terminate.
        PDFNet.Terminate()
215
# Run the samples only when this file is executed as a script.
if __name__ == '__main__':
    main()

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales
PDF Logical Structure Reader with Server SDK in Python | Apryse documentation