PDF Logical Structure Reader - Ruby Sample Code

Sample code for using the Apryse Server SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; the ContentItems are the leaf nodes of the structure tree. Sample code is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
12#---------------------------------------------------------------------------------------
13# This sample explores the structure and content of a tagged PDF document and dumps
14# the structure information to the console window.
15#
16# In tagged PDF documents StructTree acts as a central repository for information
17# related to a PDF document's logical structure. The tree consists of StructElement-s
18# and ContentItem-s which are leaf nodes of the structure tree.
19#
20# The sample can be extended to access and extract the marked-content elements such
21# as text and images.
22#---------------------------------------------------------------------------------------
23
# Starts a new console line and pads it with one space per indent level.
#
# indent - nesting depth; a value of zero or less emits only the newline.
def PrintIndent(indent)
  print "\n"
  (0...indent).each { print " " }
end
32
# Recursively prints a structure element and everything beneath it.
#
# element - structure element to report; nothing is printed when its
#           IsValid check fails.
# indent  - nesting depth for this element's own line; its kids are
#           printed one level deeper.
def ProcessStructElement(element, indent)
  return unless element.IsValid

  # Header line: element type followed by the optional title.
  PrintIndent(indent)
  child_indent = indent + 1
  print "Type: " + element.GetType
  print ". Title:" + element.GetTitle if element.HasTitle

  kid_count = element.GetNumKids
  (0...kid_count).each do |idx|
    unless element.IsContentItem(idx)
      # Not a leaf: descend into the nested structure element.
      ProcessStructElement(element.GetAsStructElem(idx), child_indent)
      next
    end

    # Leaf node (ContentItem): report its page and how it is referenced.
    item = element.GetAsContentItem(idx)
    kind = item.GetType
    owner_page = item.GetPage

    PrintIndent(child_indent)
    print "Content Item. Part of page #" + owner_page.GetIndex.to_s
    PrintIndent(child_indent)
    case kind
    when ContentItem::E_MCID, ContentItem::E_MCR
      # Both kinds are reported with the same "MCID" label.
      print "MCID: " + item.GetMCID.to_s
    when ContentItem::E_OBJR
      print "OBJR "
      target = item.GetRefObj
      print "- Referenced Object#: " + target.GetObjNum.to_s unless target.nil?
    end
  end
end
77
78# Used in code snippet 3.
# Collects the text of one page, grouped by marked-content identifier.
#
# reader - element reader already positioned on a page; it is consumed
#          until Next returns nil.
#
# Returns a Hash mapping each MCID (>= 0) to the concatenation of the text
# strings of every text element carrying that MCID, in reading order.
# Only text elements are considered; other element types are skipped.
def ProcessElements2(reader)
  text_by_mcid = {}
  until (item = reader.Next).nil?
    id = item.GetStructMCID
    # The element type is only queried for elements that carry an MCID.
    next unless id >= 0 and item.GetType == Element::E_text

    chunk = item.GetTextString
    text_by_mcid[id] =
      text_by_mcid.has_key?(id) ? text_by_mcid[id].to_s + chunk : chunk
  end
  text_by_mcid
end
100
101# Used in code snippet 2.
# Walks the page content exposed by +reader+ and, for every path, text, or
# form XObject element, prints the element kind followed by details of the
# structure element it belongs to (if any).
#
# reader - element reader already positioned on a page; it is consumed
#          until Next returns nil.
#
# Elements of any other type are skipped. A form XObject is only reported;
# its own content stream is not descended into here.
def ProcessElements(reader)
  until (element = reader.Next).nil? # Read page contents
    # In this sample we process only paths, text & form XObjects, but the
    # code can be extended to handle any element type.
    case element.GetType
    when Element::E_path # Process path ...
      print "\nPATH: "
    when Element::E_text # Process text ...
      print "\nTEXT: " + element.GetTextString + "\n"
    when Element::E_form # Process form XObjects
      # Previously this branch repeated E_path and could never be reached,
      # so form XObjects were silently ignored.
      print "\nFORM XObject: "
    else
      next
    end

    # Check if the element is associated with any structural element.
    # Content items are leaf nodes of the structure tree.
    struct_parent = element.GetParentStructElement
    next unless struct_parent.IsValid

    # Print out the parent structural element's type, title, and object number.
    print " Type: " + struct_parent.GetType.to_s + ", MCID: " + element.GetStructMCID.to_s
    print ". Title: " + struct_parent.GetTitle if struct_parent.HasTitle
    print ", Obj#: " + struct_parent.GetSDFObj.GetObjNum.to_s
  end
end
135
# Prints a structure element as an XML-like tag, with the page text of its
# MCID content items between the opening and closing tags, recursing into
# nested structure elements.
#
# element      - structure element to print; nothing is printed when its
#                IsValid check fails.
# mcid_doc_map - Hash of page index => (Hash of MCID => text String).
# indent       - nesting depth of the opening and closing tag lines.
def ProcessStructElement2(element, mcid_doc_map, indent)
  return unless element.IsValid

  # Opening tag: element type plus an optional title attribute.
  PrintIndent(indent)
  print "<" + element.GetType
  print " title=\"" + element.GetTitle + "\"" if element.HasTitle
  print ">"

  kid_total = element.GetNumKids
  (0...kid_total).each do |k|
    if !element.IsContentItem(k)
      # The kid is another StructElement node.
      ProcessStructElement2(element.GetAsStructElem(k), mcid_doc_map, indent + 1)
    else
      item = element.GetAsContentItem(k)
      # Only MCID content items carry text that can be looked up.
      next unless item.GetType == ContentItem::E_MCID

      page_no = item.GetPage.GetIndex
      next unless mcid_doc_map.has_key?(page_no)

      page_text = mcid_doc_map[page_no]
      key = item.GetMCID
      print page_text[key] if page_text.has_key?(key)
    end
  end

  # Closing tag on its own line, at the same depth as the opening tag.
  PrintIndent(indent)
  print "</" + element.GetType + ">"
end
172
# Driver: runs the three logical-structure samples against tagged.pdf and
# then saves a linearized copy of the document.
PDFNet.Initialize(PDFTronLicense.Key)

# Relative paths to the folders holding the test input and receiving output.
input_path = "../../TestFiles/"
output_path = "../../TestFiles/Output/"

# Open the tagged document whose logical structure is extracted below.
doc = PDFDoc.new(input_path + "tagged.pdf")
doc.InitSecurityHandler

puts "____________________________________________________________"
puts "Sample 1 - Traverse logical structure tree..."

tree = doc.GetStructTree
if !tree.IsValid
  puts "This document does not contain any logical structure."
else
  puts "Document has a StructTree root."

  # Recursively get structure info for all child elements.
  kid = 0
  until kid >= tree.GetNumKids
    ProcessStructElement(tree.GetKid(kid), 0)
    kid += 1
  end
end

puts "\nDone 1."

puts "____________________________________________________________"
puts "Sample 2 - Get parent logical structure elements from"
puts "layout elements."

# Visit every page and report the structure parent of each layout element.
reader = ElementReader.new
page_itr = doc.GetPageIterator
while page_itr.HasNext
  reader.Begin(page_itr.Current)
  ProcessElements(reader)
  reader.End
  page_itr.Next
end

puts "\nDone 2."

puts "____________________________________________________________"
puts "Sample 3 - 'XML style' extraction of PDF logical structure and page content."

# A map from page numbers (Integers) to per-page maps, each of which maps
# a struct MCID (Integer) to its text String.
mcid_doc_map = {}
reader = ElementReader.new
page_itr = doc.GetPageIterator
while page_itr.HasNext
  reader.Begin(page_itr.Current)
  page_index = page_itr.Current.GetIndex
  mcid_doc_map[page_index] = ProcessElements2(reader)
  reader.End
  page_itr.Next
end

# Print the structure tree again, this time with the collected page text.
tree = doc.GetStructTree
if tree.IsValid
  kid = 0
  until kid >= tree.GetNumKids
    ProcessStructElement2(tree.GetKid(kid), mcid_doc_map, 0)
    kid += 1
  end
end
puts "\nDone 3."

target_file = output_path + "LogicalStructure.pdf"
doc.Save(target_file, SDFDoc::E_linearized)
doc.Close
PDFNet.Terminate

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales