PDF Logical Structure Reader - Go Sample Code

Sample code for using Apryse Server SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s, which are leaf nodes of the structure tree. Sample code is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8 "fmt"
9 "strconv"
10 "os"
11 . "pdftron"
12)
13
14import "pdftron/Samples/LicenseKey/GO"
15
16//---------------------------------------------------------------------------------------
17// This sample explores the structure and content of a tagged PDF document and dumps
18// the structure information to the console window.
19//
20// In tagged PDF documents StructTree acts as a central repository for information
21// related to a PDF document's logical structure. The tree consists of StructElement-s
22// and ContentItem-s which are leaf nodes of the structure tree.
23//
24// The sample can be extended to access and extract the marked-content elements such
25// as text and images.
26//---------------------------------------------------------------------------------------
27
// PrintIndent starts a new line on standard output and then writes indent
// single-space characters, so that whatever the caller writes next appears
// at that nesting depth. A zero or negative indent emits only the newline.
func PrintIndent(indent int) {
	os.Stdout.Write([]byte("\n"))

	pad := []byte(" ")
	for remaining := indent; remaining > 0; remaining-- {
		os.Stdout.Write(pad)
	}
}
36
37func ProcessStructElement(element SElement, indent int){
38 if !element.IsValid(){
39 return
40 }
41
42 // Print out the type and title info, if any.
43 PrintIndent(indent)
44 indent = indent + 1
45 os.Stdout.Write([]byte("Type: " + element.GetType()))
46 if element.HasTitle(){
47 os.Stdout.Write([]byte(". Title:" + element.GetTitle()))
48 }
49 num := element.GetNumKids()
50 i := 0
51 for i < num{
52 // Check if the kid is a leaf node (i.e. it is a ContentItem)
53 if element.IsContentItem(i){
54 cont := element.GetAsContentItem(i)
55 etype := cont.GetType()
56
57 page := cont.GetPage()
58
59 PrintIndent(indent)
60 os.Stdout.Write([]byte("Content Item. Part of page //" + strconv.Itoa(page.GetIndex())))
61 PrintIndent(indent)
62 if etype == ContentItemE_MCID{
63 os.Stdout.Write([]byte("MCID: " + strconv.Itoa(cont.GetMCID())))
64 }else if etype == ContentItemE_MCR{
65 os.Stdout.Write([]byte("MCID: " + strconv.Itoa(cont.GetMCID())))
66 }else if etype == ContentItemE_OBJR{
67 os.Stdout.Write([]byte("OBJR "))
68 refObj := cont.GetRefObj()
69 if refObj != nil{
70 os.Stdout.Write([]byte("- Referenced Object//: " + strconv.Itoa(int(refObj.GetObjNum()))))
71 }
72 }
73 }else{
74 ProcessStructElement(element.GetAsStructElem(i), indent)
75 }
76 i = i + 1
77 }
78}
79
80// Used in code snippet 3.
81func ProcessElements2(reader ElementReader, mcidPageMap map[int]string){
82 element := reader.Next()
83 for element.GetMp_elem().Swigcptr() != 0{ // Read page contents
84 // In this sample we process only text, but the code can be extended
85 // to handle paths, images, or other Element type.
86 mcid := element.GetStructMCID()
87
88 if mcid >= 0 && element.GetType() == ElementE_text{
89 val := element.GetTextString()
90 if _, ok := mcidPageMap[mcid]; ok {
91 mcidPageMap[mcid] = mcidPageMap[mcid] + val
92 }else{
93 mcidPageMap[mcid] = val
94 }
95 }
96 element = reader.Next()
97 }
98}
99
100// Used in code snippet 2.
101func ProcessElements(reader ElementReader){
102 element := reader.Next()
103 for element.GetMp_elem().Swigcptr() != 0{ // Read page contents
104 // In this sample we process only paths & text, but the code can be
105 // extended to handle any element type.
106 etype := element.GetType()
107 if (etype == ElementE_path ||
108 etype == ElementE_text ||
109 etype == ElementE_path){
110 if etype == ElementE_path{ // Process path ...
111 os.Stdout.Write([]byte("\nPATH: "))
112 }else if etype == ElementE_text{ // Process text ...
113 os.Stdout.Write([]byte("\nTEXT: " + element.GetTextString() + "\n"))
114 }else if etype == ElementE_path{ // Process from XObjects
115 os.Stdout.Write([]byte("\nFORM XObject: "))
116 }
117
118 // Check if the element is associated with any structural element.
119 // Content items are leaf nodes of the structure tree.
120 structParent := element.GetParentStructElement()
121 if structParent.IsValid(){
122 // Print out the parent structural element's type, title, and object number.
123 os.Stdout.Write([]byte(" Type: " + structParent.GetType() + ", MCID: " + strconv.Itoa(element.GetStructMCID())))
124 if structParent.HasTitle(){
125 os.Stdout.Write([]byte(". Title: " + structParent.GetTitle()))
126 }
127 os.Stdout.Write([]byte(", Obj//: " + strconv.Itoa(int(structParent.GetSDFObj().GetObjNum()))))
128 }
129 }
130 element = reader.Next()
131 }
132}
133
134func ProcessStructElement2(element SElement, mcidDocMap map[int](map[int]string), indent int){
135 if !element.IsValid(){
136 return
137 }
138 // Print out the type and title info, if any
139 PrintIndent(indent)
140 os.Stdout.Write([]byte("<" + element.GetType()))
141 if element.HasTitle(){
142 os.Stdout.Write([]byte(" title=\"" + element.GetTitle() + "\""))
143 }
144 os.Stdout.Write([]byte(">"))
145
146 num := element.GetNumKids()
147 i := 0
148 for i < num{
149 if element.IsContentItem(i){
150 cont := element.GetAsContentItem(i)
151 if cont.GetType() == ContentItemE_MCID{
152 pageNum := cont.GetPage().GetIndex()
153 if _, ok := mcidDocMap[pageNum]; ok{
154 mcidPageMap := mcidDocMap[pageNum]
155 mcidKey := cont.GetMCID()
156 if _, ok := mcidPageMap[mcidKey]; ok{
157 os.Stdout.Write([]byte(mcidPageMap[mcidKey]))
158 }
159 }
160 }
161 }else{ // the kid is another StructElement node.
162 ProcessStructElement2(element.GetAsStructElem(i), mcidDocMap, indent+1)
163 }
164 i = i + 1
165 }
166 PrintIndent(indent)
167 os.Stdout.Write([]byte("</" + element.GetType() + ">"))
168}
169
170func main(){
171 PDFNetInitialize(PDFTronLicense.Key)
172
173 // Relative path to the folder containing the test files.
174 inputPath := "../../TestFiles/"
175 outputPath := "../../TestFiles/Output/"
176
177 // Extract logical structure from a PDF document
178 doc := NewPDFDoc(inputPath + "tagged.pdf")
179 doc.InitSecurityHandler()
180
181 fmt.Println("____________________________________________________________")
182 fmt.Println("Sample 1 - Traverse logical structure tree...")
183
184 tree := doc.GetStructTree()
185 if tree.IsValid(){
186 fmt.Println("Document has a StructTree root.")
187
188 i := 0
189 for i < tree.GetNumKids(){
190 // Recursively get structure info for all child elements.
191 ProcessStructElement(tree.GetKid(i), 0)
192 i = i + 1
193 }
194 }else{
195 fmt.Println("This document does not contain any logical structure.")
196 }
197
198 fmt.Println("\nDone 1.")
199
200 fmt.Println("____________________________________________________________")
201 fmt.Println("Sample 2 - Get parent logical structure elements from")
202 fmt.Println("layout elements.")
203
204 reader := NewElementReader()
205 itr := doc.GetPageIterator()
206 for itr.HasNext(){
207 reader.Begin(itr.Current())
208 ProcessElements(reader)
209 reader.End()
210 itr.Next()
211 }
212
213 fmt.Println("\nDone 2.")
214
215 fmt.Println("____________________________________________________________")
216 fmt.Println("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
217 // A map which maps page numbers(as Integers)
218 // to page Maps(which map from struct mcid(as Integers) to
219 // text Strings)
220 var mcidDocMap = make(map[int](map[int]string))
221 reader = NewElementReader()
222 itr = doc.GetPageIterator()
223 for itr.HasNext(){
224 reader.Begin(itr.Current())
225 var pageMcidMap = make(map[int]string)
226 mcidDocMap[itr.Current().GetIndex()] = pageMcidMap
227 ProcessElements2(reader, pageMcidMap)
228 reader.End()
229 itr.Next()
230 }
231 tree = doc.GetStructTree()
232 if tree.IsValid(){
233 i := 0
234 for i < tree.GetNumKids(){
235 ProcessStructElement2(tree.GetKid(i), mcidDocMap, 0)
236 i = i + 1
237 }
238 }
239 fmt.Println("\nDone 3.")
240 doc.Save(outputPath + "LogicalStructure.pdf", uint(SDFDocE_linearized))
241 doc.Close()
242 PDFNetTerminate()
243}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales