PDF Logical Structure Reader - Python Sample Code

Sample code for using Apryse Server SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s, which are leaf nodes of the structure tree. Sample code is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6using System;
7using System.Collections;
8
9using pdftron;
10using pdftron.Common;
11using pdftron.Filters;
12using pdftron.SDF;
13using pdftron.PDF;
14using pdftron.PDF.Struct;
15
16
17namespace LogicalStructureTestCS
18{
19	//---------------------------------------------------------------------------------------
20	// This sample explores the structure and content of a tagged PDF document and dumps 
21	// the structure information to the console window.
22	//
23	// In tagged PDF documents StructTree acts as a central repository for information 
24	// related to a PDF document's logical structure. The tree consists of StructElement-s
25	// and ContentItem-s which are leaf nodes of the structure tree.
26	//
27	// The sample can be extended to access and extract the marked-content elements such 
28	// as text and images.
29	//---------------------------------------------------------------------------------------
30	class Class1
31	{
32		static void PrintIndent(int indent) { Console.WriteLine(); for (int i=0; i<indent; ++i) Console.Write("  "); }
33
34		// Used in code snippet 1.
35		static void ProcessStructElement(SElement element, int indent)
36		{
37			if (!element.IsValid()) {
38				return;
39			}
40
41			// Print out the type and title info, if any.
42			PrintIndent(indent++);
43			Console.Write("Type: " + element.GetType());
44			if (element.HasTitle()) {
45				Console.Write(". Title: "+ element.GetTitle());
46			}
47
48			int num = element.GetNumKids();
49			for (int i=0; i<num; ++i) 
50			{
51				// Check is the kid is a leaf node (i.e. it is a ContentItem).
52				if (element.IsContentItem(i)) { 
53					ContentItem cont = element.GetAsContentItem(i); 
54					ContentItem.Type type = cont.GetType();
55
56					Page page = cont.GetPage();
57
58					PrintIndent(indent);
59					Console.Write("Content Item. Part of page #" + page.GetIndex());
60
61					PrintIndent(indent);
62					switch (type) {
63						case ContentItem.Type.e_MCID:
64						case ContentItem.Type.e_MCR:
65							Console.Write("MCID: " + cont.GetMCID());
66							break;
67						case ContentItem.Type.e_OBJR:
68							{
69								Console.Write("OBJR ");
70								Obj ref_obj = cont.GetRefObj();
71								if (ref_obj!=null)
72									Console.Write("- Referenced Object#: " + ref_obj.GetObjNum());
73							}
74							break;
75						default: 
76							break;
77					}
78				}
79				else {  // the kid is another StructElement node.
80					ProcessStructElement(element.GetAsStructElem(i), indent);
81				}
82			}
83		}
84
85		// Used in code snippet 2.
86		static void ProcessElements(ElementReader reader)
87		{
88			Element element;
89			while ((element = reader.Next())!=null) 	// Read page contents
90			{
91				// In this sample we process only paths & text, but the code can be 
92				// extended to handle any element type.
93				Element.Type type = element.GetType();
94				if (type == Element.Type.e_path || type == Element.Type.e_text || type == Element.Type.e_path) 
95				{   
96					switch (type)	{
97					case Element.Type.e_path:               // Process path ...
98						Console.WriteLine();
99						Console.Write("PATH: ");
100						break; 
101					case Element.Type.e_text: 				// Process text ...
102						Console.WriteLine();
103						Console.WriteLine("TEXT: " + element.GetTextString());
104						break;
105					case Element.Type.e_form:				// Process form XObjects
106						Console.WriteLine();
107						Console.Write("FORM XObject: ");
108						//reader.FormBegin(); 
109						//ProcessElements(reader);
110						//reader.End(); 
111						break; 
112					}
113
114					// Check if the element is associated with any structural element.
115					// Content items are leaf nodes of the structure tree.
116					SElement struct_parent = element.GetParentStructElement();
117					if (struct_parent.IsValid()) {
118						// Print out the parent structural element's type, title, and object number.
119						Console.Write(" Type: " + struct_parent.GetType() 
120							+ ", MCID: " + element.GetStructMCID());
121						if (struct_parent.HasTitle()) {
122							Console.Write(". Title: "+ struct_parent.GetTitle());
123						}
124						Console.Write(", Obj#: " + struct_parent.GetSDFObj().GetObjNum());
125					}
126				}
127			}
128		}
129
130		// Used in code snippet 3.
131		//typedef map<int, string> MCIDPageMap;
132		//typedef map<int, MCIDPageMap> MCIDDocMap;
133
134		// Used in code snippet 3.
135		static void ProcessElements2(ElementReader reader, Hashtable mcid_page_map)
136		{
137			Element element;
138			while ((element = reader.Next())!=null) // Read page contents
139			{
140				// In this sample we process only text, but the code can be extended 
141				// to handle paths, images, or any other Element type.
142				int mcid = element.GetStructMCID();
143				if (mcid>= 0 && element.GetType() == Element.Type.e_text) {
144					String val = element.GetTextString();
145					if (mcid_page_map.ContainsKey(mcid)) mcid_page_map[mcid] = ((String)(mcid_page_map[mcid])+ val); 
146					else mcid_page_map.Add(mcid, val);
147				}
148			}
149		}
150
151		// Used in code snippet 3.
152		static void ProcessStructElement2(SElement element, Hashtable mcid_doc_map, int indent)
153		{
154			if (!element.IsValid()) {
155				return;
156			}
157
158			// Print out the type and title info, if any.
159			PrintIndent(indent);
160			Console.Write("<" + element.GetType());
161			if (element.HasTitle()) {
162				Console.Write(" title=\""+ element.GetTitle() + "\"");
163			}
164			Console.Write(">");
165
166			int num = element.GetNumKids();
167			for (int i=0; i<num; ++i) 
168			{		
169				if (element.IsContentItem(i)) { 
170					ContentItem cont = element.GetAsContentItem(i); 
171					if (cont.GetType() == ContentItem.Type.e_MCID) {
172						int page_num = cont.GetPage().GetIndex();
173						if (mcid_doc_map.ContainsKey(page_num)) {
174							Hashtable mcid_page_map = (Hashtable)(mcid_doc_map[page_num]);
175							int mcid = cont.GetMCID();
176							if (mcid_page_map.ContainsKey(mcid)) {
177								Console.Write(mcid_page_map[mcid]); 
178							}                    
179						}
180					}
181				}
182				else {  // the kid is another StructElement node.
183					ProcessStructElement2(element.GetAsStructElem(i), mcid_doc_map, indent+1);
184				}
185			}
186
187			PrintIndent(indent);
188			Console.Write("</" + element.GetType() + ">");
189		}
190
191		private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
192		static Class1() {}
193
194		/// <summary>
195		/// The main entry point for the application.
196		/// </summary>
197		static void Main(string[] args)
198		{
199			PDFNet.Initialize(PDFTronLicense.Key);
200			// Relative path to the folder containing test files.
201			string input_path =  "../../../../TestFiles/";
202			string output_path = "../../../../TestFiles/Output/";
203
204			try  // Extract logical structure from a PDF document
205			{
206				using (PDFDoc doc = new PDFDoc(input_path + "tagged.pdf"))
207				{
208					doc.InitSecurityHandler();
209
210					bool example1 = true;
211					bool example2 = true;
212					bool example3 = true;
213
214					if (example1)
215					{
216						Console.WriteLine("____________________________________________________________");
217						Console.WriteLine("Sample 1 - Traverse logical structure tree...");
218
219						STree tree = doc.GetStructTree();
220						if (tree.IsValid()) 
221						{
222							Console.WriteLine("Document has a StructTree root.");
223							for (int i=0; i<tree.GetNumKids(); ++i) 
224							{
225								// Recursively get structure  info for all all child elements.
226								ProcessStructElement(tree.GetKid(i), 0);
227							}
228						}
229						else 
230						{
231							Console.WriteLine("This document does not contain any logical structure.");
232						}
233
234						Console.WriteLine();
235						Console.WriteLine("Done 1.");
236					}
237
238					if (example2)
239					{
240						Console.WriteLine("____________________________________________________________");
241						Console.WriteLine("Sample 2 - Get parent logical structure elements from");
242						Console.WriteLine("layout elements.");
243						
244						ElementReader reader=new ElementReader();
245						for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) 
246						{				
247							reader.Begin(itr.Current());
248							ProcessElements(reader);
249							reader.End();
250						}
251						Console.WriteLine();
252						Console.WriteLine("Done 2.");
253					}
254
255					if (example3)
256					{
257						Console.WriteLine("____________________________________________________________");
258						Console.WriteLine("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
259						
260						//A map which maps page numbers(as Integers)
261						//to page Maps(which map from struct mcid(as Integers) to
262						//text Strings)
263						Hashtable mcid_doc_map=new Hashtable();
264						ElementReader reader=new ElementReader();
265						for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) 
266						{				
267							Page pg = itr.Current();
268							reader.Begin(pg);
269							Hashtable page_mcid_map=new Hashtable();
270							mcid_doc_map.Add(pg.GetIndex(), page_mcid_map);
271							ProcessElements2(reader, page_mcid_map);
272							reader.End();
273						}
274						
275						STree tree = doc.GetStructTree();
276						if (tree.IsValid()) 
277						{
278							for (int i=0; i<tree.GetNumKids(); ++i) 
279							{
280								ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0);
281							}
282						}
283						Console.WriteLine();
284						Console.WriteLine("Done 3.");
285					}
286
287					doc.Save(output_path + "LogicalStructure.pdf", 0);
288				}
289			}
290			catch (PDFNetException e)
291			{
292				Console.WriteLine(e.Message);
293			}
294			PDFNet.Terminate();
295		}
296	}
297}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/PDFNet.h>
7#include <PDF/PDFDoc.h>
8#include <PDF/ElementReader.h>
9#include <iostream>
10#include <map>
11#include "../../LicenseKey/CPP/LicenseKey.h"
12
13using namespace pdftron;
14using namespace PDF;
15using namespace std;
16
17//---------------------------------------------------------------------------------------
18// This sample explores the structure and content of a tagged PDF document and dumps 
19// the structure information to the console window.
20//
21// In tagged PDF documents StructTree acts as a central repository for information 
22// related to a PDF document's logical structure. The tree consists of StructElement-s
23// and ContentItem-s which are leaf nodes of the structure tree.
24//
25// The sample can be extended to access and extract the marked-content elements such 
26// as text and images.
27//---------------------------------------------------------------------------------------
28
29
30void PrintIndent(int indent) { cout << '\n'; for (int i=0; i<indent; ++i) cout << "  "; }
31
32// Used in code snippet 1.
33void ProcessStructElement(Struct::SElement element, int ident) 
34{
35	if (!element.IsValid()) {
36		return;
37	}
38
39	// Print out the type and title info, if any.
40	PrintIndent(ident++);
41	cout << "Type: "<< element.GetType();
42	if (element.HasTitle()) {
43		cout << ". Title: "<< element.GetTitle();
44	}
45
46	int num = element.GetNumKids();
47	for (int i=0; i<num; ++i) 
48	{
49		// Check is the kid is a leaf node (i.e. it is a ContentItem).
50		if (element.IsContentItem(i)) { 
51			Struct::ContentItem cont = element.GetAsContentItem(i); 
52			Struct::ContentItem::Type type = cont.GetType();
53
54			Page page = cont.GetPage();
55
56			PrintIndent(ident);
57			cout << "Content Item. Part of page #" << page.GetIndex();
58
59			PrintIndent(ident);
60			switch (type) {
61				case Struct::ContentItem::e_MCID:
62				case Struct::ContentItem::e_MCR:
63					cout << "MCID: " << cont.GetMCID();
64					break;
65				case Struct::ContentItem::e_OBJR:
66					{
67						cout << "OBJR ";
68						if (SDF::Obj ref_obj = cont.GetRefObj())
69							cout << "- Referenced Object#: " << ref_obj.GetObjNum();
70					}
71					break;
72				default: 
73					break;
74			}
75		}
76		else {  // the kid is another StructElement node.
77			ProcessStructElement(element.GetAsStructElem(i), ident);
78		}
79	}
80}
81
82// Used in code snippet 2.
83void ProcessElements(ElementReader& reader) 
84{
85	Element element;
86	while (element = reader.Next()) 	// Read page contents
87	{
88		// In this sample we process only paths & text, but the code can be 
89		// extended to handle any element type.
90		Element::Type type = element.GetType();
91		if (type == Element::e_path || type == Element::e_text || type == Element::e_path) 
92		{   
93			switch (type)	{
94			case Element::e_path:				// Process path ...
95				cout << "\nPATH: ";
96				break; 
97			case Element::e_text: 				// Process text ...
98				cout << "\nTEXT: " << element.GetTextString() << endl;
99				break;
100			case Element::e_form:				// Process form XObjects
101				cout << "\nFORM XObject: ";
102				//reader.FormBegin(); 
103				//ProcessElements(reader);
104				//reader.End(); 
105				break; 
106			}
107
108			// Check if the element is associated with any structural element.
109			// Content items are leaf nodes of the structure tree.
110			Struct::SElement struct_parent = element.GetParentStructElement();
111			if (struct_parent.IsValid()) {
112				// Print out the parent structural element's type, title, and object number.
113				cout << " Type: " << struct_parent.GetType() 
114					<< ", MCID: " << element.GetStructMCID();
115				if (struct_parent.HasTitle()) {
116					cout << ". Title: "<< struct_parent.GetTitle();
117				}
118				cout << ", Obj#: " << struct_parent.GetSDFObj().GetObjNum();
119			}
120		}
121	}
122}
123
124// Used in code snippet 3.
125typedef map<int, string> MCIDPageMap;
126typedef map<int, MCIDPageMap> MCIDDocMap;
127
128// Used in code snippet 3.
129void ProcessElements2(ElementReader& reader, MCIDPageMap& mcid_page_map) 
130{
131	Element element;
132	while (element = reader.Next()) // Read page contents
133	{
134		// In this sample we process only text, but the code can be extended 
135		// to handle paths, images, or any other Element type.
136		int mcid = element.GetStructMCID();
137		if (mcid>= 0 && element.GetType() == Element::e_text) {
138			string val = element.GetTextString().ConvertToAscii();
139			MCIDPageMap::iterator itr = mcid_page_map.find(mcid);
140			if (itr != mcid_page_map.end()) itr->second += val; 
141			else mcid_page_map.insert(MCIDPageMap::value_type(mcid, val));
142		}
143	}
144}
145
146// Used in code snippet 3.
147void ProcessStructElement2(Struct::SElement element, MCIDDocMap& mcid_doc_map, int ident) 
148{
149	if (!element.IsValid()) {
150		return;
151	}
152
153	// Print out the type and title info, if any.
154	PrintIndent(ident);
155	cout << "<" << element.GetType();
156	if (element.HasTitle()) {
157		cout << " title=\""<< element.GetTitle() << "\"";
158	}
159	cout << ">";
160
161	int num = element.GetNumKids();
162	for (int i=0; i<num; ++i) 
163	{		
164		if (element.IsContentItem(i)) { 
165			Struct::ContentItem cont = element.GetAsContentItem(i); 
166			if (cont.GetType() == Struct::ContentItem::e_MCID) {
167				int page_num = cont.GetPage().GetIndex();
168				MCIDDocMap::iterator itr = mcid_doc_map.find(page_num);
169				if (itr!=mcid_doc_map.end()) {
170					MCIDPageMap& mcid_page_map = itr->second;
171					MCIDPageMap::iterator itr2 = mcid_page_map.find(cont.GetMCID());
172					if (itr2 != mcid_page_map.end()) {
173						cout << itr2->second; 
174					}                    
175				}
176			}
177		}
178		else {  // the kid is another StructElement node.
179			ProcessStructElement2(element.GetAsStructElem(i), mcid_doc_map, ident+1);
180		}
181	}
182
183	PrintIndent(ident);
184	cout << "</" << element.GetType() << ">";
185}
186
187
188int main(int argc, char *argv[])
189{
190	int ret = 0;
191	PDFNet::Initialize(LicenseKey);
192
193	// Relative path to the folder containing test files.
194	string input_path =  "../../TestFiles/";
195	string output_path = "../../TestFiles/Output/";
196
197	try	// Extract logical structure from a PDF document
198	{
199		PDFDoc doc((input_path + "tagged.pdf").c_str());
200		doc.InitSecurityHandler();
201
202		cout << "____________________________________________________________" << endl;
203		cout << "Sample 1 - Traverse logical structure tree..." << endl;
204		{
205			Struct::STree tree = doc.GetStructTree();
206			if (tree.IsValid()) {
207				cout << "Document has a StructTree root." << endl;				
208
209				for (int i=0; i<tree.GetNumKids(); ++i) {
210					// Recursively get structure info for all child elements.
211					ProcessStructElement(tree.GetKid(i), 0);
212				}
213			}
214			else {
215				cout << "This document does not contain any logical structure." << endl;
216			}
217		}
218		cout << "\nDone 1." << endl;
219
220		cout << "____________________________________________________________" << endl;
221		cout << "Sample 2 - Get parent logical structure elements from" << endl;
222		cout << "layout elements." << endl;
223		{
224			ElementReader reader;
225			for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) {				
226				reader.Begin(itr.Current());
227				ProcessElements(reader);
228				reader.End();
229			}
230		}
231		cout << "\nDone 2." << endl;
232
233		cout << "____________________________________________________________" << endl;
234		cout << "Sample 3 - 'XML style' extraction of PDF logical structure and page content." << endl;
235		{
236			MCIDDocMap mcid_doc_map;
237			ElementReader reader;
238			for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) {				
239				reader.Begin(itr.Current());
240				pair<MCIDDocMap::iterator, bool> r = mcid_doc_map.insert(MCIDDocMap::value_type(itr.Current().GetIndex(), MCIDPageMap()));
241				MCIDPageMap& page_mcid_map = (r.first)->second;
242				ProcessElements2(reader, page_mcid_map);
243				reader.End();
244			}
245
246			Struct::STree tree = doc.GetStructTree();
247			if (tree.IsValid()) {
248				for (int i=0; i<tree.GetNumKids(); ++i) {
249					ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0);
250				}
251			}
252		}
253		cout << "\nDone 3." << endl;
254
255		doc.Save(output_path + "LogicalStructure.pdf", 0);
256	}
257	catch(Common::Exception& e) 
258	{
259		cout << e << endl;
260		ret = 1;
261	}
262	catch(...) 
263	{
264		cout << "Unknown Exception" << endl;
265		ret = 1;
266	}
267
268	PDFNet::Terminate();
269	return ret;
270}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8	"fmt"
9    "strconv"
10    "os"
11	. "pdftron"
12)
13
14import  "pdftron/Samples/LicenseKey/GO"
15
16//---------------------------------------------------------------------------------------
17// This sample explores the structure and content of a tagged PDF document and dumps 
18// the structure information to the console window.
19//
20// In tagged PDF documents StructTree acts as a central repository for information 
21// related to a PDF document's logical structure. The tree consists of StructElement-s
22// and ContentItem-s which are leaf nodes of the structure tree.
23//
24// The sample can be extended to access and extract the marked-content elements such 
25// as text and images.
26//---------------------------------------------------------------------------------------
27
// PrintIndent starts a new line on standard output and then writes two spaces
// for every indentation level.
func PrintIndent(indent int) {
    os.Stdout.Write([]byte("\n"))
    for level := 0; level < indent; level++ {
        os.Stdout.Write([]byte("  "))
    }
}
36
37func ProcessStructElement(element SElement, indent int){
38    if !element.IsValid(){
39        return
40    }
41
42    // Print out the type and title info, if any.
43    PrintIndent(indent)
44    indent = indent + 1
45    os.Stdout.Write([]byte("Type: " + element.GetType()))
46    if element.HasTitle(){
47        os.Stdout.Write([]byte(". Title:" + element.GetTitle()))
48    }
49    num := element.GetNumKids()
50    i := 0
51    for i < num{
52        // Check if the kid is a leaf node (i.e. it is a ContentItem)
53        if element.IsContentItem(i){
54            cont := element.GetAsContentItem(i)
55            etype := cont.GetType()
56            
57            page := cont.GetPage()
58            
59            PrintIndent(indent)
60            os.Stdout.Write([]byte("Content Item. Part of page //" + strconv.Itoa(page.GetIndex())))
61            PrintIndent(indent)
62            if etype == ContentItemE_MCID{
63                os.Stdout.Write([]byte("MCID: " + strconv.Itoa(cont.GetMCID())))
64            }else if etype == ContentItemE_MCR{
65                os.Stdout.Write([]byte("MCID: " + strconv.Itoa(cont.GetMCID())))
66            }else if etype == ContentItemE_OBJR{
67                os.Stdout.Write([]byte("OBJR "))
68                refObj := cont.GetRefObj()
69                if refObj != nil{
70                    os.Stdout.Write([]byte("- Referenced Object//: " + strconv.Itoa(int(refObj.GetObjNum()))))
71                }
72            }
73        }else{
74            ProcessStructElement(element.GetAsStructElem(i), indent)
75        }
76        i = i + 1
77    }
78}    
79
80// Used in code snippet 3.
81func ProcessElements2(reader ElementReader, mcidPageMap map[int]string){
82    element := reader.Next()
83    for element.GetMp_elem().Swigcptr() != 0{ // Read page contents
84        // In this sample we process only text, but the code can be extended
85        // to handle paths, images, or other Element type.
86        mcid := element.GetStructMCID()
87        
88        if mcid >= 0 && element.GetType() == ElementE_text{
89            val := element.GetTextString()
90            if _, ok := mcidPageMap[mcid]; ok {
91                mcidPageMap[mcid] = mcidPageMap[mcid] + val
92            }else{
93                mcidPageMap[mcid] = val
94            }
95        }
96        element = reader.Next()
97    }
98}
99
100// Used in code snippet 2.
101func ProcessElements(reader ElementReader){
102    element := reader.Next()
103    for element.GetMp_elem().Swigcptr() != 0{  // Read page contents
104        // In this sample we process only paths & text, but the code can be 
105        // extended to handle any element type.
106        etype := element.GetType()
107        if (etype == ElementE_path ||
108            etype == ElementE_text ||
109            etype == ElementE_path){
110            if etype == ElementE_path{      // Process path ...
111                os.Stdout.Write([]byte("\nPATH: "))
112            }else if etype == ElementE_text{    // Process text ...
113                os.Stdout.Write([]byte("\nTEXT: " + element.GetTextString() + "\n"))
114            }else if etype == ElementE_path{    // Process from XObjects
115                os.Stdout.Write([]byte("\nFORM XObject: "))
116            }
117
118            // Check if the element is associated with any structural element.
119            // Content items are leaf nodes of the structure tree.
120            structParent := element.GetParentStructElement()
121            if structParent.IsValid(){
122                // Print out the parent structural element's type, title, and object number.
123                os.Stdout.Write([]byte(" Type: " + structParent.GetType() + ", MCID: " + strconv.Itoa(element.GetStructMCID())))
124                if structParent.HasTitle(){
125                    os.Stdout.Write([]byte(". Title: " + structParent.GetTitle()))
126                }
127                os.Stdout.Write([]byte(", Obj//: " + strconv.Itoa(int(structParent.GetSDFObj().GetObjNum()))))
128            }
129        }
130        element = reader.Next()
131    }
132}        
133        
134func ProcessStructElement2(element SElement, mcidDocMap map[int](map[int]string), indent int){
135    if !element.IsValid(){
136        return
137    }
138    // Print out the type and title info, if any
139    PrintIndent(indent)
140    os.Stdout.Write([]byte("<" + element.GetType()))
141    if element.HasTitle(){
142        os.Stdout.Write([]byte(" title=\"" + element.GetTitle() + "\""))
143    }
144    os.Stdout.Write([]byte(">"))
145    
146    num := element.GetNumKids()
147    i := 0
148    for i < num{
149        if element.IsContentItem(i){
150            cont := element.GetAsContentItem(i)
151            if cont.GetType() == ContentItemE_MCID{
152                pageNum := cont.GetPage().GetIndex()
153                if _, ok := mcidDocMap[pageNum]; ok{
154                    mcidPageMap := mcidDocMap[pageNum]
155                    mcidKey := cont.GetMCID()
156                    if _, ok := mcidPageMap[mcidKey]; ok{
157                        os.Stdout.Write([]byte(mcidPageMap[mcidKey]))
158                    }
159                }
160            }
161        }else{ // the kid is another StructElement node.
162            ProcessStructElement2(element.GetAsStructElem(i), mcidDocMap, indent+1)     
163        } 
164        i = i + 1
165    }
166    PrintIndent(indent)
167    os.Stdout.Write([]byte("</" + element.GetType() + ">"))
168}        
169
170func main(){
171    PDFNetInitialize(PDFTronLicense.Key)
172    
173    // Relative path to the folder containing the test files.
174    inputPath := "../../TestFiles/"
175    outputPath := "../../TestFiles/Output/"
176    
177    // Extract logical structure from a PDF document
178    doc := NewPDFDoc(inputPath + "tagged.pdf")
179    doc.InitSecurityHandler()
180    
181    fmt.Println("____________________________________________________________")
182    fmt.Println("Sample 1 - Traverse logical structure tree...")
183    
184    tree := doc.GetStructTree()
185    if tree.IsValid(){
186        fmt.Println("Document has a StructTree root.")
187        
188        i := 0
189        for i < tree.GetNumKids(){
190            // Recursively get structure info for all child elements.
191            ProcessStructElement(tree.GetKid(i), 0)
192            i = i + 1
193        }
194    }else{
195        fmt.Println("This document does not contain any logical structure.")
196    }
197
198    fmt.Println("\nDone 1.")
199
200    fmt.Println("____________________________________________________________")
201    fmt.Println("Sample 2 - Get parent logical structure elements from")
202    fmt.Println("layout elements.")
203    
204    reader := NewElementReader()
205    itr := doc.GetPageIterator()
206    for itr.HasNext(){
207        reader.Begin(itr.Current())
208        ProcessElements(reader)
209        reader.End()
210        itr.Next()
211    }
212
213    fmt.Println("\nDone 2.")
214    
215    fmt.Println("____________________________________________________________")
216    fmt.Println("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
217    // A map which maps page numbers(as Integers)
218    // to page Maps(which map from struct mcid(as Integers) to
219    // text Strings)
220    var mcidDocMap = make(map[int](map[int]string))
221    reader = NewElementReader()
222    itr = doc.GetPageIterator()
223    for itr.HasNext(){
224        reader.Begin(itr.Current())
225        var pageMcidMap = make(map[int]string)
226        mcidDocMap[itr.Current().GetIndex()] = pageMcidMap
227        ProcessElements2(reader, pageMcidMap)
228        reader.End()
229        itr.Next() 
230    } 
231    tree = doc.GetStructTree()
232    if tree.IsValid(){
233        i := 0
234        for i < tree.GetNumKids(){
235            ProcessStructElement2(tree.GetKid(i), mcidDocMap, 0)
236            i = i + 1  
237        }
238    }
239    fmt.Println("\nDone 3.")
240    doc.Save(outputPath + "LogicalStructure.pdf", uint(SDFDocE_linearized))
241    doc.Close()        
242    PDFNetTerminate()
243}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import java.util.Map;
7import java.util.TreeMap;
8
9import com.pdftron.common.PDFNetException;
10import com.pdftron.pdf.struct.*;
11import com.pdftron.pdf.*;
12import com.pdftron.sdf.*;
13
14//---------------------------------------------------------------------------------------
15// This sample explores the structure and content of a tagged PDF document and dumps 
16// the structure information to the console window.
17//
18// In tagged PDF documents StructTree acts as a central repository for information 
19// related to a PDF document's logical structure. The tree consists of StructElement-s
20// and ContentItem-s which are leaf nodes of the structure tree.
21//
22// The sample can be extended to access and extract the marked-content elements such 
23// as text and images.
24//---------------------------------------------------------------------------------------
25public class LogicalStructureTest {
26    static void PrintIndent(int indent) {
27        System.out.println();
28        for (int i = 0; i < indent; ++i) System.out.print("  ");
29    }
30
31    // Used in code snippet 1.
32    static void ProcessStructElement(SElement element, int indent) throws PDFNetException {
33        if (!element.isValid()) {
34            return;
35        }
36
37        // Print out the type and title info, if any.
38        PrintIndent(indent++);
39        System.out.print("Type: " + element.getType());
40        if (element.hasTitle()) {
41            System.out.print(". Title: " + element.getTitle());
42        }
43
44        int num = element.getNumKids();
45        for (int i = 0; i < num; ++i) {
46            // Check is the kid is a leaf node (i.e. it is a ContentItem).
47            if (element.isContentItem(i)) {
48                ContentItem cont = element.getAsContentItem(i);
49                int type = cont.getType();
50
51                Page page = cont.getPage();
52
53                PrintIndent(indent);
54                System.out.print("Content Item. Part of page #" + page.getIndex());
55
56                PrintIndent(indent);
57                switch (type) {
58                    case ContentItem.e_MCID:
59                    case ContentItem.e_MCR:
60                        System.out.print("MCID: " + cont.getMCID());
61                        break;
62                    case ContentItem.e_OBJR: {
63                        System.out.print("OBJR ");
64                        Obj ref_obj = cont.getRefObj();
65                        if (ref_obj != null)
66                            System.out.print("- Referenced Object#: " + ref_obj.getObjNum());
67                    }
68                    break;
69                    default:
70                        break;
71                }
72            } else {  // the kid is another StructElement node.
73                ProcessStructElement(element.getAsStructElem(i), indent);
74            }
75        }
76    }
77
78    // Used in code snippet 2.
79    static void ProcessElements(ElementReader reader) throws PDFNetException {
80        Element element;
81        while ((element = reader.next()) != null)    // Read page contents
82        {
83            // In this sample we process only paths & text, but the code can be
84            // extended to handle any element type.
85            int type = element.getType();
86            if (type == Element.e_path || type == Element.e_text || type == Element.e_path) {
87                switch (type) {
88                    case Element.e_path:                // Process path ...
89                        System.out.print("\nPATH: ");
90                        break;
91                    case Element.e_text:                // Process text ...
92                        System.out.print("\nTEXT: " + element.getTextString() + "\n");
93                        break;
94                    case Element.e_form:                // Process form XObjects
95                        System.out.print("\nFORM XObject: ");
96                        //reader.FormBegin();
97                        //ProcessElements(reader);
98                        //reader.End();
99                        break;
100                }
101
102                // Check if the element is associated with any structural element.
103                // Content items are leaf nodes of the structure tree.
104                SElement struct_parent = element.getParentStructElement();
105                if (struct_parent.isValid()) {
106                    // Print out the parent structural element's type, title, and object number.
107                    System.out.print(" Type: " + struct_parent.getType()
108                            + ", MCID: " + element.getStructMCID());
109                    if (struct_parent.hasTitle()) {
110                        System.out.print(". Title: " + struct_parent.getTitle());
111                    }
112                    System.out.print(", Obj#: " + struct_parent.getSDFObj().getObjNum());
113                }
114            }
115        }
116    }
117
118    // Used in code snippet 3.
119    //typedef map<int, string> MCIDPageMap;
120    //typedef map<int, MCIDPageMap> MCIDDocMap;
121
122    // Used in code snippet 3.
123    static void ProcessElements2(ElementReader reader, Map<Integer, String> mcid_page_map) throws PDFNetException {
124        Element element;
125        while ((element = reader.next()) != null) // Read page contents
126        {
127            // In this sample we process only text, but the code can be extended
128            // to handle paths, images, or any other Element type.
129            int mcid = element.getStructMCID();
130            Integer key_mcid = new Integer(mcid);
131            if (mcid >= 0 && element.getType() == Element.e_text) {
132                String val = element.getTextString();
133                if (mcid_page_map.containsKey(key_mcid))
134                    mcid_page_map.put(key_mcid, ((String) (mcid_page_map.get(key_mcid)) + val));
135                else mcid_page_map.put(key_mcid, val);
136            }
137        }
138    }
139
140    // Used in code snippet 3.
141    static void ProcessStructElement2(SElement element, Map<Integer, Map<Integer, String>> mcid_doc_map, int indent) throws PDFNetException {
142        if (!element.isValid()) {
143            return;
144        }
145
146        // Print out the type and title info, if any.
147        PrintIndent(indent);
148        System.out.print("<" + element.getType());
149        if (element.hasTitle()) {
150            System.out.print(" title=\"" + element.getTitle() + "\"");
151        }
152        System.out.print(">");
153
154        int num = element.getNumKids();
155        for (int i = 0; i < num; ++i) {
156            if (element.isContentItem(i)) {
157                ContentItem cont = element.getAsContentItem(i);
158                if (cont.getType() == ContentItem.e_MCID) {
159                    int page_num = cont.getPage().getIndex();
160                    Integer page_num_key = new Integer(page_num);
161                    if (mcid_doc_map.containsKey(page_num_key)) {
162                        Map<Integer, String> mcid_page_map = mcid_doc_map.get(page_num_key);
163                        Integer mcid_key = new Integer(cont.getMCID());
164                        if (mcid_page_map.containsKey(mcid_key)) {
165                            System.out.print(mcid_page_map.get(mcid_key));
166                        }
167                    }
168                }
169            } else {  // the kid is another StructElement node.
170                ProcessStructElement2(element.getAsStructElem(i), mcid_doc_map, indent + 1);
171            }
172        }
173
174        PrintIndent(indent);
175        System.out.print("</" + element.getType() + ">");
176    }
177
178
179    /**
180     * @param args
181     */
182    public static void main(String[] args) {
183        PDFNet.initialize(PDFTronLicense.Key());
184
185        // Relative path to the folder containing test files.
186        String input_path = "../../TestFiles/";
187        String output_path = "../../TestFiles/Output/";
188
189        try (PDFDoc doc = new PDFDoc((input_path + "tagged.pdf")))    // Extract logical structure from a PDF document
190        {
191            doc.initSecurityHandler();
192
193            System.out.println("____________________________________________________________");
194            System.out.println("Sample 1 - Traverse logical structure tree...");
195            {
196                STree tree = doc.getStructTree();
197                if (tree.isValid()) {
198                    System.out.println("Document has a StructTree root.");
199
200                    for (int i = 0; i < tree.getNumKids(); ++i) {
201                        // Recursively get structure  info for all all child elements.
202                        ProcessStructElement(tree.getKid(i), 0);
203                    }
204                } else {
205                    System.out.println("This document does not contain any logical structure.");
206                }
207            }
208            System.out.println("\nDone 1.");
209
210            System.out.println("____________________________________________________________");
211            System.out.println("Sample 2 - Get parent logical structure elements from");
212            System.out.println("layout elements.");
213            {
214                ElementReader reader = new ElementReader();
215                for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
216                    reader.begin(itr.next());
217                    ProcessElements(reader);
218                    reader.end();
219                }
220            }
221            System.out.println("\nDone 2.");
222
223            System.out.println("____________________________________________________________");
224            System.out.println("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
225            {
226                //A map which maps page numbers(as Integers)
227                //to page Maps(which map from struct mcid(as Integers) to
228                //text Strings)
229                Map<Integer, Map<Integer, String>> mcid_doc_map = new TreeMap<Integer, Map<Integer, String>>();
230                ElementReader reader = new ElementReader();
231                for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
232                    Page current = itr.next();
233                    reader.begin(current);
234                    Map<Integer, String> page_mcid_map = new TreeMap<Integer, String>();
235                    mcid_doc_map.put(new Integer(current.getIndex()), page_mcid_map);
236                    ProcessElements2(reader, page_mcid_map);
237                    reader.end();
238                }
239
240                STree tree = doc.getStructTree();
241                if (tree.isValid()) {
242                    for (int i = 0; i < tree.getNumKids(); ++i) {
243                        ProcessStructElement2(tree.getKid(i), mcid_doc_map, 0);
244                    }
245                }
246            }
247            System.out.println("\nDone 3.");
248            doc.save((output_path + "LogicalStructure.pdf"), SDFDoc.SaveMode.LINEARIZED, null);
249        } catch (Exception e) {
250            e.printStackTrace();
251        }
252
253        PDFNet.terminate();
254    }
255
256}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6//---------------------------------------------------------------------------------------
7// This sample explores the structure and content of a tagged PDF document and dumps
8// the structure information to the console window.
9//
10// In tagged PDF documents StructTree acts as a central repository for information
11// related to a PDF document's logical structure. The tree consists of StructElement-s
12// and ContentItem-s which are leaf nodes of the structure tree.
13//
14// The sample can be extended to access and extract the marked-content elements such
15// as text and images.
16//---------------------------------------------------------------------------------------
17
18
19const { PDFNet } = require('@pdftron/pdfnet-node');
20const PDFTronLicense = require('../LicenseKey/LicenseKey');
21
22((exports) => {
23
24  exports.runLogicalStructureTest = () => {
25
26    const printAndIndent = (printState, indent) => {
27      console.log(printState.str);
28
29      let indentStr = '';
30      for (let i = 0; i < indent; ++i) {
31        indentStr += '  ';
32      }
33    printState.str = indentStr;
34    };
35
36    // Used in code snippet 1.
37    const processStructElement = async(element, indent, printState) => {
38      if (!(await element.isValid())) {
39        return;
40      }
41
42
43      // Print out the type and title info, if any.
44      printAndIndent(printState, indent++);
45      printState.str += 'Type: ' + (await element.getType());
46      if (await element.hasTitle()) {
47        printState.str += '. Title: ' + (await element.getTitle());
48      }
49
50      const num = await element.getNumKids();
51      for (let i = 0; i < num; ++i) {
52        // Check is the kid is a leaf node (i.e. it is a ContentItem).
53        if (await element.isContentItem(i)) {
54          const cont = await element.getAsContentItem(i);
55          const type = await cont.getType();
56
57          const page = await cont.getPage();
58
59          printAndIndent(printState, indent);
60          printState.str += 'Content Item. Part of page #' + (await page.getIndex());
61
62          printAndIndent(printState, indent);
63          switch (type) {
64            case PDFNet.ContentItem.Type.e_MCID:
65            case PDFNet.ContentItem.Type.e_MCR:
66              printState.str += 'MCID: ' + (await cont.getMCID());
67              break;
68            case PDFNet.ContentItem.Type.e_OBJR:
69              {
70                printState.str += 'OBJR ';
71                const refObj = await cont.getRefObj();
72                if (refObj) {
73                  printState.str += '- Referenced Object#: ' + refObj.getObjNum();
74                }
75              }
76              break;
77            default:
78              break;
79          }
80        } else { // the kid is another StructElement node.
81          await processStructElement(await element.getAsStructElem(i), indent, printState);
82        }
83      }
84    };
85
86    // Used in code snippet 2.
87    const processElements = async(reader, printState) => {
88      let element;
89      while (element = await reader.next()) { // Read page contents
90        // In this sample we process only paths & text, but the code can be
91        // extended to handle any element type.
92        const type = await element.getType();
93        if (type === PDFNet.Element.Type.e_path || type === PDFNet.Element.Type.e_text || type === PDFNet.Element.Type.e_path) {
94          switch (type) {
95            case PDFNet.Element.Type.e_path: // Process path ...
96              printState.str += '\nPATH: ';
97              break;
98            case PDFNet.Element.Type.e_text: // Process text ...
99              printState.str += '\nTEXT: ' + (await element.getTextString()) + '\n';
100              break;
101            case PDFNet.Element.Type.e_form: // Process form XObjects
102              printState.str += '\nFORM XObject: ';
103              // reader.formBegin();
104              // await ProcessElements(reader);
105              // reader.end();
106              break;
107          }
108
109          // Check if the element is associated with any structural element.
110          // Content items are leaf nodes of the structure tree.
111          const structParent = await element.getParentStructElement();
112          if (await structParent.isValid()) {
113            // Print out the parent structural element's type, title, and object number.
114            printState.str += ' Type: ' + (await structParent.getType()) + ', MCID: ' + (await element.getStructMCID());
115            if (await structParent.hasTitle()) {
116              printState.str += '. Title: ' + (await structParent.getTitle());
117            }
118            printState.str += ', Obj#: ' + (await (await structParent.getSDFObj()).getObjNum());
119          }
120        }
121      }
122    };
123
124    // Used in code snippet 3.
125    const processElements2 = async(reader, mcidPageMap) => {
126      let element;
127      while (element = await reader.next()) { // Read page contents
128        // In this sample we process only text, but the code can be extended
129        // to handle paths, images, or any other Element type.
130        const mcid = await element.getStructMCID();
131        if (mcid >= 0 && (await element.getType()) === PDFNet.Element.Type.e_text) {
132          const val = await element.getTextString();
133          if (mcid in mcidPageMap) {
134            mcidPageMap[mcid] += val;
135          } else {
136            mcidPageMap[mcid] = val;
137          }
138        }
139      }
140    };
141
142    // Used in code snippet 3.
143    const processStructElement2 = async(element, mcidDocMap, indent, printState) => {
144      if (!(await element.isValid())) {
145        return;
146      }
147
148      // Print out the type and title info, if any.
149      printAndIndent(printState, indent);
150      printState.str += '<' + (await element.getType());
151      if (await element.hasTitle()) {
152        printState.str += ' title="' + (await element.getTitle()) + '"';
153      }
154      printState.str += '>';
155
156      const num = await element.getNumKids();
157      for (let i = 0; i < num; ++i) {
158        if (await element.isContentItem(i)) {
159          const cont = await element.getAsContentItem(i);
160          if ((await cont.getType()) === PDFNet.ContentItem.Type.e_MCID) {
161            const pageNum = await (await cont.getPage()).getIndex();
162            const mcidPageMap = mcidDocMap[pageNum];
163            if (mcidPageMap) {
164              const mcid = await cont.getMCID();
165              if (mcid in mcidPageMap) {
166                printState.str += mcidPageMap[mcid];
167              }
168            }
169          }
170        } else { // the kid is another StructElement node.
171          await processStructElement2(await element.getAsStructElem(i), mcidDocMap, indent + 1, printState);
172        }
173      }
174
175      printAndIndent(printState, indent);
176      printState.str += '</' + (await element.getType()) + '>';
177    };
178
179    const main = async() => {
180      // Relative path to the folder containing test files.
181      const inputPath = '../TestFiles/';
182      const printState = { str: '' };
183      try { // Extract logical structure from a PDF document
184        const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'tagged.pdf');
185        doc.initSecurityHandler();
186
187        let reader = null;
188        let tree = null;
189
190        console.log('____________________________________________________________');
191        console.log('Sample 1 - Traverse logical structure tree...');
192        tree = await doc.getStructTree();
193        if (await tree.isValid()) {
194          console.log('Document has a StructTree root.');
195          for (let i = 0, numKids = await tree.getNumKids(); i < numKids; ++i) {
196            // Recursively get structure info for all child elements.
197            await processStructElement(await tree.getKid(i), 0, printState);
198          }
199        } else {
200          console.log('This document does not contain any logical structure.');
201        }
202        printAndIndent(printState, 0);
203        console.log('Done 1.');
204
205        console.log('____________________________________________________________');
206        console.log('Sample 2 - Get parent logical structure elements from');
207        console.log('layout elements.');
208        reader = await PDFNet.ElementReader.create();
209        for (let itr = await doc.getPageIterator(); await itr.hasNext(); itr.next()) {
210          reader.beginOnPage(await itr.current());
211          await processElements(reader, printState);
212          reader.end();
213        }
214        printAndIndent(printState, 0);
215        console.log('Done 2.');
216
217        console.log('____________________________________________________________');
218        console.log("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
219        {
220          const mcidDocMap = {};
221          for (let itr = await doc.getPageIterator(); await itr.hasNext(); itr.next()) {
222            const page = await itr.current();
223            reader.beginOnPage(page);
224            const pageNum = await page.getIndex();
225            const pageMcidMap = {};
226            mcidDocMap[pageNum] = pageMcidMap;
227            await processElements2(reader, pageMcidMap);
228            reader.end();
229          }
230
231          tree = await doc.getStructTree();
232          if (await tree.isValid()) {
233            for (let i = 0, numKids = await tree.getNumKids(); i < numKids; ++i) {
234              await processStructElement2(await tree.getKid(i), mcidDocMap, 0, printState);
235            }
236          }
237        }
238        printAndIndent(printState, 0);
239        console.log('Done 3.');
240        await doc.save(inputPath + 'Output/LogicalStructure.pdf', 0);
241      } catch (err) {
242        console.log(err);
243      }
244    };
245
246    PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function(error){console.log('Error: ' + JSON.stringify(error));}).then(function(){return PDFNet.shutdown();});
247  };
248  exports.runLogicalStructureTest();
249})(exports);
250// eslint-disable-next-line spaced-comment
251//# sourceURL=LogicalStructureTest.js

1<?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
10// Relative path to the folder containing the test files.
11$input_path = getcwd()."/../../TestFiles/";
12$output_path = $input_path."Output/";
13
14//---------------------------------------------------------------------------------------
15// This sample explores the structure and content of a tagged PDF document and dumps 
16// the structure information to the console window.
17//
18// In tagged PDF documents StructTree acts as a central repository for information 
19// related to a PDF document's logical structure. The tree consists of StructElement-s
20// and ContentItem-s which are leaf nodes of the structure tree.
21//
22// The sample can be extended to access and extract the marked-content elements such 
23// as text and images.
24//---------------------------------------------------------------------------------------
25
// Emits a line break (newline converted by nl2br) followed by two spaces per
// indentation level.
function PrintIdent($ident)
{
	echo nl2br("\n");
	for ($level = $ident; $level > 0; --$level) {
		echo "  ";
	}
}
27
28// Used in code snippet 1.
// Recursively prints a structure element followed by all of its kids.
//
// The element's type (and title, when present) is written at depth $ident.
// Kids are written one level deeper: content items are reported with their
// page index plus either their MCID or the referenced object number, and
// nested structure elements are handled by recursion. Invalid elements are
// skipped.
function ProcessStructElement($element, $ident) 
{
	if (!$element->IsValid()) {
		return;
	}

	// The element's own line goes at the current depth; its kids go one level deeper.
	PrintIdent($ident);
	$kid_ident = $ident + 1;
	echo "Type: ".$element->GetType();
	if ($element->HasTitle()) {
		echo ". Title: ".$element->GetTitle();
	}

	$kid_count = $element->GetNumKids();
	for ($k = 0; $k < $kid_count; ++$k)
	{
		if (!$element->IsContentItem($k)) {
			// Not a leaf: the kid is another StructElement node.
			ProcessStructElement($element->GetAsStructElem($k), $kid_ident);
			continue;
		}

		// Leaf node: the kid is a ContentItem.
		$item = $element->GetAsContentItem($k);
		$item_type = $item->GetType();
		$page = $item->GetPage();

		PrintIdent($kid_ident);
		echo "Content Item. Part of page #".$page->GetIndex();

		PrintIdent($kid_ident);
		if ($item_type == ContentItem::e_MCID || $item_type == ContentItem::e_MCR) {
			echo "MCID: ".$item->GetMCID();
		}
		elseif ($item_type == ContentItem::e_OBJR) {
			echo "OBJR ";
			$ref_obj = $item->GetRefObj();
			if ($ref_obj) {
				echo "- Referenced Object#: ".$ref_obj->GetObjNum();
			}
		}
		// Any other content item type prints nothing after the indent.
	}
}
77
78// Used in code snippet 2.
// Reads every element the reader yields and reports paths, text runs and form
// XObjects, followed by details of the structure element (if any) that the
// layout element belongs to.
function ProcessElements($reader) 
{
	while ($element = $reader->Next()) 	// Read page contents
	{
		// In this sample we process only paths, text & form XObjects, but the
		// code can be extended to handle any element type.
		$type = $element->GetType();
		// The filter previously tested e_path twice, which made the e_form case
		// below unreachable.
		if ($type == Element::e_path || $type == Element::e_text || $type == Element::e_form) 
		{   
			switch ($type)	{
			case Element::e_path:				// Process path ...
				echo nl2br("\nPATH: ");
				break; 
			case Element::e_text: 				// Process text ...
				echo nl2br("\nTEXT: ".$element->GetTextString()."\n");
				break;
			case Element::e_form:				// Process form XObjects
				echo nl2br("\nFORM XObject: ");
				// The form's own content is not traversed here. The original
				// sample hints at doing so with:
				//$reader->FormBegin(); 
				//ProcessElements($reader);
				//$reader->End(); 
				break; 
			}

			// Check if the element is associated with any structural element.
			// Content items are leaf nodes of the structure tree.
			$struct_parent = $element->GetParentStructElement();
			if ($struct_parent->IsValid()) {
				// Print out the parent structural element's type, title, and object number.
				echo " Type: ".$struct_parent->GetType() 
					.", MCID: ".$element->GetStructMCID();
				if ($struct_parent->HasTitle()) {
					echo ". Title: ".$struct_parent->GetTitle();
				}
				echo ", Obj#: ".$struct_parent->GetSDFObj()->GetObjNum();
			}
		}
	}
}
118
119// Used in code snippet 3.
// Collects the text of the current page, grouped by marked-content id.
// For each text element with a non-negative structure MCID, its text is
// appended to (or stored as) $mcid_page_map[$mcid]; other elements are ignored.
// $mcid_page_map is taken by reference and updated in place.
function ProcessElements2($reader, &$mcid_page_map) 
{
	while (($element = $reader->Next()) != null) // Read page contents
	{
		// In this sample we process only text, but the code can be extended 
		// to handle paths, images, or any other Element type.
		$mcid = $element->GetStructMCID();
		if (!($mcid >= 0 && $element->GetType() == Element::e_text)) {
			continue;
		}

		$text = $element->GetTextString();
		$mcid_page_map[$mcid] = array_key_exists($mcid, $mcid_page_map)
			? $mcid_page_map[$mcid].$text
			: $text;
	}
}
139
140// Used in code snippet 3.
// Recursively prints a structure element in an XML-like form: an opening tag
// built from the element's type (plus a title attribute when present), the
// kids, then the closing tag. For MCID content items the text recorded under
// $mcid_doc_map[page index][MCID] is printed when available; nested structure
// elements are printed one level deeper. Invalid elements are skipped.
function ProcessStructElement2($element, &$mcid_doc_map, $ident) 
{
	if (!$element->IsValid()) {
		return;
	}

	// Opening tag, with the title attribute if the element has one.
	PrintIdent($ident);
	echo "<".$element->GetType();
	if ($element->HasTitle()) {
		echo " title=\"".$element->GetTitle()."\"";
	}
	echo ">";

	$kid_count = $element->GetNumKids();
	for ($k = 0; $k < $kid_count; ++$k)
	{
		if (!$element->IsContentItem($k)) {
			// The kid is another StructElement node.
			ProcessStructElement2($element->GetAsStructElem($k), $mcid_doc_map, $ident+1);
			continue;
		}

		$item = $element->GetAsContentItem($k);
		if ($item->GetType() != ContentItem::e_MCID) {
			continue;
		}

		$page_num = $item->GetPage()->GetIndex();
		if (!array_key_exists($page_num, $mcid_doc_map)) {
			continue;
		}

		$text_by_mcid = $mcid_doc_map[$page_num];
		$mcid = $item->GetMCID();
		if (array_key_exists($mcid, $text_by_mcid)) {
			echo $text_by_mcid[$mcid];
		}
	}

	// Closing tag on its own line at the element's depth.
	PrintIdent($ident);
	echo "</".$element->GetType().">";
}
178
179	PDFNet::Initialize($LicenseKey);
180	PDFNet::GetSystemFontList();    // Wait for fonts to be loaded if they haven't already. This is done because PHP can run into errors when shutting down if font loading is still in progress.
181
182	// Extract logical structure from a PDF document
183
184	$doc = new PDFDoc($input_path."tagged.pdf");
185	$doc->InitSecurityHandler();
186
187	echo nl2br("____________________________________________________________\n");
188	echo nl2br("Sample 1 - Traverse logical structure tree...\n");
189
190	$tree = $doc->GetStructTree();
191	if ($tree->IsValid()) {
192		echo nl2br("Document has a StructTree root.\n");
193
194		for ($i=0; $i<$tree->GetNumKids(); ++$i) {
195			// Recursively get structure info for all child elements.
196			ProcessStructElement($tree->GetKid($i), 0);
197		}
198	}
199	else {
200		echo nl2br("This document does not contain any logical structure.\n");
201	}
202
203	echo nl2br("\nDone 1.\n");
204
205	echo nl2br("____________________________________________________________\n");
206	echo nl2br("Sample 2 - Get parent logical structure elements from\n");
207	echo nl2br("layout elements.\n");
208	
209	$reader = new ElementReader();
210	for ($itr = $doc->GetPageIterator(); $itr->HasNext(); $itr->Next()) {				
211		$reader->Begin($itr->Current());
212		ProcessElements($reader);
213		$reader->End();
214	}
215	
216	echo nl2br("\nDone 2.\n");
217
218	echo nl2br("____________________________________________________________\n");
219	echo nl2br("Sample 3 - 'XML style' extraction of PDF logical structure and page content.\n");
220	
221	$mcid_doc_map = array();
222	$reader = new ElementReader();
223	for ($itr = $doc->GetPageIterator(); $itr->HasNext(); $itr->Next()) {				
224		$reader->Begin($itr->Current());
225		$mcid_doc_map[$itr->Current()->GetIndex()] = array();
226		ProcessElements2($reader, $mcid_doc_map[$itr->Current()->GetIndex()]);
227		$reader->End();
228	}
229	$tree = $doc->GetStructTree();
230	if ($tree->IsValid()) {
231		for ($i=0; $i<$tree->GetNumKids(); ++$i) {
232			ProcessStructElement2($tree->GetKid($i), $mcid_doc_map, 0);
233		}
234	}
235	
236	echo nl2br("\nDone 3.\n");	
237	$doc->Save(($output_path ."LogicalStructure.pdf"), SDFDoc::e_linearized);
238	$doc->Close();     
239	PDFNet::Terminate();   
240?>

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
14#---------------------------------------------------------------------------------------
15# This sample explores the structure and content of a tagged PDF document and dumps 
16# the structure information to the console window.
17#
18# In tagged PDF documents StructTree acts as a central repository for information 
19# related to a PDF document's logical structure. The tree consists of StructElement-s
20# and ContentItem-s which are leaf nodes of the structure tree.
21#
22# The sample can be extended to access and extract the marked-content elements such 
23# as text and images.
24#---------------------------------------------------------------------------------------
25
def PrintIndent(indent):
    """Start a new output line and pad it with two spaces per indentation level.

    indent: number of indentation levels; zero or less writes only the newline.
    """
    sys.stdout.write("\n")
    for _ in range(indent):
        sys.stdout.write("  ")
32        
def ProcessStructElement(element, indent):
    """Recursively print a structure element followed by all of its kids.

    The element's type (and title, when present) is written at depth
    ``indent``. Kids are written one level deeper: content items are reported
    with their page index plus either their MCID or the referenced object
    number, and nested structure elements are handled by recursion. Invalid
    elements are skipped.

    element: structure element to print.
    indent:  depth at which the element's own line is printed.
    """
    if not element.IsValid():
        return

    # Print out the type and title info, if any.
    PrintIndent(indent)
    indent = indent + 1
    sys.stdout.write("Type: " + element.GetType())
    if element.HasTitle():
        # ". Title: " now has the trailing space that ProcessElements uses.
        sys.stdout.write(". Title: " + element.GetTitle())

    for i in range(element.GetNumKids()):
        if not element.IsContentItem(i):
            # The kid is another StructElement node.
            ProcessStructElement(element.GetAsStructElem(i), indent)
            continue

        # The kid is a leaf node (i.e. it is a ContentItem).
        cont = element.GetAsContentItem(i)
        item_type = cont.GetType()  # named so the builtin `type` is not shadowed
        page = cont.GetPage()

        PrintIndent(indent)
        sys.stdout.write("Content Item. Part of page #" + str(page.GetIndex()))
        PrintIndent(indent)
        if item_type in (ContentItem.e_MCID, ContentItem.e_MCR):
            sys.stdout.write("MCID: " + str(cont.GetMCID()))
        elif item_type == ContentItem.e_OBJR:
            sys.stdout.write("OBJR ")
            ref_obj = cont.GetRefObj()
            if ref_obj is not None:
                sys.stdout.write("- Referenced Object#: " + str(ref_obj.GetObjNum()))
69    
70
71# Used in code snippet 3.
def ProcessElements2(reader, mcid_page_map):
    """Collect the text of the current page, grouped by marked-content id.

    For each text element with a non-negative structure MCID, its text is
    appended to (or stored as) ``mcid_page_map[mcid]``. All other elements are
    ignored.

    reader:        element reader to pull elements from until Next() returns None.
    mcid_page_map: dict from MCID to accumulated text; updated in place.
    """
    element = reader.Next()
    while element is not None:  # Read page contents
        # In this sample we process only text, but the code can be extended
        # to handle paths, images, or other Element type.
        mcid = element.GetStructMCID()

        if mcid >= 0 and element.GetType() == Element.e_text:
            val = element.GetTextString()
            # One lookup with a default replaces the membership test + index.
            mcid_page_map[mcid] = mcid_page_map.get(mcid, "") + val
        element = reader.Next()
87
88# Used in code snippet 2.
def ProcessElements(reader):
    """Report paths, text runs and form XObjects read from the reader.

    For each reported element, details of the structure element it belongs to
    (type, MCID, optional title, object number) are appended when that parent
    is valid. Elements of any other type are skipped.

    reader: element reader to pull elements from until Next() returns None.
    """
    element = reader.Next()
    while element is not None:  # Read page contents
        # In this sample we process only paths, text & form XObjects, but the
        # code can be extended to handle any element type.
        elem_type = element.GetType()  # named so the builtin `type` is not shadowed
        # Both the filter and the last branch previously tested e_path a second
        # time, which made the form XObject branch unreachable.
        if (elem_type == Element.e_path or
            elem_type == Element.e_text or
            elem_type == Element.e_form):
            if elem_type == Element.e_path:      # Process path ...
                sys.stdout.write("\nPATH: ")
            elif elem_type == Element.e_text:    # Process text ...
                sys.stdout.write("\nTEXT: " + element.GetTextString() + "\n")
            elif elem_type == Element.e_form:    # Process form XObjects
                sys.stdout.write("\nFORM XObject: ")

            # Check if the element is associated with any structural element.
            # Content items are leaf nodes of the structure tree.
            struct_parent = element.GetParentStructElement()
            if struct_parent.IsValid():
                # Print out the parent structural element's type, title, and object number.
                sys.stdout.write(" Type: " + str(struct_parent.GetType())
                                 + ", MCID: " + str(element.GetStructMCID()))
                if struct_parent.HasTitle():
                    sys.stdout.write(". Title: " + struct_parent.GetTitle())
                sys.stdout.write(", Obj#: " + str(struct_parent.GetSDFObj().GetObjNum()))
        element = reader.Next()
116        
117        
def ProcessStructElement2(element, mcid_doc_map, indent):
    """Write `element` and its descendants to sys.stdout as XML-like tags.

    element      -- structure element to print; an invalid element prints nothing.
    mcid_doc_map -- maps a page index to a map from MCID to text.
    indent       -- nesting level handed to PrintIndent for the open/close tags.

    The opening tag carries the element type and, when present, its title.
    For each kid that is a content item of type ContentItem.e_MCID, the text
    stored under its page index and MCID is written if both keys exist.
    Kids that are structure elements are processed recursively one level deeper.
    """
    if not element.IsValid():
        return

    # Opening tag with the type and title info, if any.
    PrintIndent(indent)
    sys.stdout.write("<" + element.GetType())
    if element.HasTitle():
        sys.stdout.write(" title=\"" + element.GetTitle() + "\"")
    sys.stdout.write(">")

    for kid_index in range(element.GetNumKids()):
        if not element.IsContentItem(kid_index):
            # The kid is another StructElement node.
            ProcessStructElement2(element.GetAsStructElem(kid_index), mcid_doc_map, indent + 1)
            continue

        leaf = element.GetAsContentItem(kid_index)
        if leaf.GetType() != ContentItem.e_MCID:
            continue

        page_index = leaf.GetPage().GetIndex()
        if page_index not in mcid_doc_map:
            continue

        page_texts = mcid_doc_map[page_index]
        marked_id = leaf.GetMCID()
        if marked_id in page_texts:
            sys.stdout.write(page_texts[marked_id])

    # Closing tag.
    PrintIndent(indent)
    sys.stdout.write("</" + element.GetType() + ">")
146        
147
def main():
    """Run the three logical-structure samples on tagged.pdf.

    Sample 1 walks the structure tree with ProcessStructElement, sample 2
    reads every page with ProcessElements, and sample 3 builds a per-page
    MCID-to-text map with ProcessElements2 and prints the tree with
    ProcessStructElement2.  The document is then saved as
    LogicalStructure.pdf in the output folder and closed.
    """
    PDFNet.Initialize(LicenseKey)

    # Relative path to the folder containing the test files.
    input_path = "../../TestFiles/"
    output_path = "../../TestFiles/Output/"
    separator = "____________________________________________________________"

    # Extract logical structure from a PDF document
    doc = PDFDoc(input_path + "tagged.pdf")
    doc.InitSecurityHandler()

    print(separator)
    print("Sample 1 - Traverse logical structure tree...")

    tree = doc.GetStructTree()
    if not tree.IsValid():
        print("This document does not contain any logical structure.")
    else:
        print("Document has a StructTree root.")

        kid_index = 0
        while kid_index < tree.GetNumKids():
            # Recursively get structure info for all child elements.
            ProcessStructElement(tree.GetKid(kid_index), 0)
            kid_index += 1

    print("\nDone 1.")

    print(separator)
    print("Sample 2 - Get parent logical structure elements from")
    print("layout elements.")

    reader = ElementReader()
    page_itr = doc.GetPageIterator()
    while page_itr.HasNext():
        reader.Begin(page_itr.Current())
        ProcessElements(reader)
        reader.End()
        page_itr.Next()

    print("\nDone 2.")

    print(separator)
    print("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
    # Outer map: page number -> page map.
    # Page map: struct MCID -> text string.
    mcid_doc_map = {}
    reader = ElementReader()
    page_itr = doc.GetPageIterator()
    while page_itr.HasNext():
        reader.Begin(page_itr.Current())
        page_mcid_map = {}
        mcid_doc_map[page_itr.Current().GetIndex()] = page_mcid_map
        ProcessElements2(reader, page_mcid_map)
        reader.End()
        page_itr.Next()

    tree = doc.GetStructTree()
    if tree.IsValid():
        kid_index = 0
        while kid_index < tree.GetNumKids():
            ProcessStructElement2(tree.GetKid(kid_index), mcid_doc_map, 0)
            kid_index += 1

    print("\nDone 3.")
    doc.Save((output_path + "LogicalStructure.pdf"), SDFDoc.e_linearized)
    doc.Close()
    PDFNet.Terminate()


if __name__ == '__main__':
    main()

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
12#---------------------------------------------------------------------------------------
13# This sample explores the structure and content of a tagged PDF document and dumps 
14# the structure information to the console window.
15#
16# In tagged PDF documents StructTree acts as a central repository for information 
17# related to a PDF document's logical structure. The tree consists of StructElement-s
18# and ContentItem-s which are leaf nodes of the structure tree.
19#
20# The sample can be extended to access and extract the marked-content elements such 
21# as text and images.
22#---------------------------------------------------------------------------------------
23
# Starts a new output line, then writes two spaces for each indentation level.
def PrintIndent(indent)
	print "\n"
	(0...indent).each { print "  " }
end
32		
# Prints the type and title of a structure element, then describes each kid.
#
# element - structure element to describe; an invalid element prints nothing.
# indent  - indentation level for this element's line; kids are printed one
#           level deeper.
#
# A kid that is a content item is reported with its page index and either
# its MCID (for E_MCID and E_MCR items) or, for E_OBJR items, the number of
# the referenced object when one is present. A kid that is a structure
# element is processed recursively.
def ProcessStructElement(element, indent)
	if !element.IsValid
		return
	end

	# Print out the type and title info, if any.
	PrintIndent(indent)
	indent = indent + 1
	print "Type: " + element.GetType
	if element.HasTitle
		# Same ". Title: " separator as ProcessElements uses.
		print ". Title: " + element.GetTitle
	end

	num = element.GetNumKids
	i = 0
	while i < num do
		# Check if the kid is a leaf node (i.e. it is a ContentItem)
		if element.IsContentItem(i)
			cont = element.GetAsContentItem(i)
			type = cont.GetType

			page = cont.GetPage

			PrintIndent(indent)
			print "Content Item. Part of page #" + page.GetIndex.to_s
			PrintIndent(indent)
			case type
			when ContentItem::E_MCID, ContentItem::E_MCR
				print "MCID: " + cont.GetMCID.to_s
			when ContentItem::E_OBJR
				print "OBJR "
				ref_obj = cont.GetRefObj
				if !ref_obj.nil?
					print "- Referenced Object#: " + ref_obj.GetObjNum.to_s
				end
			end
		else
			# The kid is another structure element.
			ProcessStructElement(element.GetAsStructElem(i), indent)
		end
		i = i + 1
	end
end
77
78# Used in code snippet 3.
# Reads every element from reader and returns a Hash mapping struct MCID to text.
#
# Only elements with a non-negative MCID and type Element::E_text contribute;
# text for an MCID that is already in the Hash is appended to the stored value.
# The sample can be extended to handle paths, images, or other Element types.
def ProcessElements2(reader)
	texts_by_mcid = {}
	loop do
		item = reader.Next
		break if item.nil?	# No more page content.

		marked_id = item.GetStructMCID
		next unless marked_id >= 0 && item.GetType == Element::E_text

		chunk = item.GetTextString
		texts_by_mcid[marked_id] =
			if texts_by_mcid.has_key?(marked_id)
				texts_by_mcid[marked_id].to_s + chunk
			else
				chunk
			end
	end
	texts_by_mcid
end
100
101# Used in code snippet 2.
# Prints each path, text and form XObject element read from reader, followed
# by details of its parent structure element when that parent is valid.
#
# Elements are pulled with reader.Next until it returns nil. Elements of any
# other type are skipped. For a valid parent the output contains the parent's
# type, the element's MCID, the parent's title (if any) and its object number.
def ProcessElements(reader)
	element = reader.Next
	while !element.nil? do	# Read page contents
		# In this sample we process only paths, text and form XObjects, but
		# the code can be extended to handle any element type.
		type = element.GetType
		if (type == Element::E_path or
			type == Element::E_text or
			type == Element::E_form)
			case type
			when Element::E_path	# Process path ...
				print "\nPATH: "
			when Element::E_text	# Process text ...
				print "\nTEXT: " + element.GetTextString + "\n"
			when Element::E_form	# Process form XObjects
				print "\nFORM XObject: "
			end

			# Check if the element is associated with any structural element.
			# Content items are leaf nodes of the structure tree.
			struct_parent = element.GetParentStructElement
			if struct_parent.IsValid
				# Print out the parent structural element's type, title, and object number.
				print " Type: " + struct_parent.GetType.to_s + ", MCID: " + element.GetStructMCID.to_s
				if struct_parent.HasTitle
					print ". Title: " + struct_parent.GetTitle
				end
				print ", Obj#: " + struct_parent.GetSDFObj.GetObjNum.to_s
			end
		end
		element = reader.Next
	end
end
135		
# Prints a structure element and its descendants as XML-like tags.
#
# element      - structure element to print; an invalid element prints nothing.
# mcid_doc_map - Hash from page index to a Hash from MCID to text.
# indent       - nesting level passed to PrintIndent for the open/close tags.
#
# Kids that are content items of type ContentItem::E_MCID print the text stored
# for their page index and MCID when both keys exist; kids that are structure
# elements are printed recursively one level deeper.
def ProcessStructElement2(element, mcid_doc_map, indent)
	return unless element.IsValid

	# Opening tag with the type and title info, if any.
	PrintIndent(indent)
	print "<" + element.GetType
	print " title=\"" + element.GetTitle + "\"" if element.HasTitle
	print ">"

	kid_count = element.GetNumKids
	(0...kid_count).each do |kid_index|
		unless element.IsContentItem(kid_index)
			# The kid is another StructElement node.
			ProcessStructElement2(element.GetAsStructElem(kid_index), mcid_doc_map, indent+1)
			next
		end

		leaf = element.GetAsContentItem(kid_index)
		next unless leaf.GetType == ContentItem::E_MCID

		page_index = leaf.GetPage.GetIndex
		next unless mcid_doc_map.has_key?(page_index)

		page_texts = mcid_doc_map[page_index]
		marked_id = leaf.GetMCID
		print page_texts[marked_id] if page_texts.has_key?(marked_id)
	end

	# Closing tag.
	PrintIndent(indent)
	print "</" + element.GetType + ">"
end
172
# Script body: runs the three logical-structure samples on tagged.pdf and
# saves the document as LogicalStructure.pdf in the output folder.
PDFNet.Initialize(PDFTronLicense.Key)

# Relative path to the folder containing the test files.
input_path = "../../TestFiles/"
output_path = "../../TestFiles/Output/"
separator = "____________________________________________________________"

# Extract logical structure from a PDF document
doc = PDFDoc.new(input_path + "tagged.pdf")
doc.InitSecurityHandler

puts separator
puts "Sample 1 - Traverse logical structure tree..."

tree = doc.GetStructTree
if !tree.IsValid
	puts "This document does not contain any logical structure."
else
	puts "Document has a StructTree root."

	kid_index = 0
	until kid_index >= tree.GetNumKids
		# Recursively get structure info for all child elements.
		ProcessStructElement(tree.GetKid(kid_index), 0)
		kid_index += 1
	end
end

puts "\nDone 1."

puts separator
puts "Sample 2 - Get parent logical structure elements from"
puts "layout elements."

reader = ElementReader.new
page_itr = doc.GetPageIterator
while page_itr.HasNext
	reader.Begin(page_itr.Current)
	ProcessElements(reader)
	reader.End
	page_itr.Next
end

puts "\nDone 2."

puts separator
puts "Sample 3 - 'XML style' extraction of PDF logical structure and page content."

# Outer Hash: page number -> page Hash.
# Page Hash: struct MCID -> text String.
mcid_doc_map = {}
reader = ElementReader.new
page_itr = doc.GetPageIterator
while page_itr.HasNext
	reader.Begin(page_itr.Current)
	mcid_doc_map[page_itr.Current.GetIndex] = ProcessElements2(reader)
	reader.End
	page_itr.Next
end

tree = doc.GetStructTree
if tree.IsValid
	kid_index = 0
	until kid_index >= tree.GetNumKids
		ProcessStructElement2(tree.GetKid(kid_index), mcid_doc_map, 0)
		kid_index += 1
	end
end

puts "\nDone 3."
doc.Save((output_path + "LogicalStructure.pdf"), SDFDoc::E_linearized)
doc.Close
PDFNet.Terminate

1'
2' Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3'
4
5Imports System
6Imports System.Collections
7Imports pdftron
8Imports pdftron.Common
9Imports pdftron.Filters
10Imports pdftron.SDF
11Imports pdftron.PDF
12Imports pdftron.PDF.Struct
13
''' <summary>
''' Console sample that explores the logical structure of a tagged PDF and
''' writes what it finds to the console. It runs three samples: a structure
''' tree traversal, a lookup of parent structure elements from page content,
''' and an "XML style" dump combining structure and marked-content text.
''' </summary>
Module LogicalStructureTestCS
    Dim pdfNetLoader As PDFNetLoader
    Sub New()
        pdfNetLoader = pdftron.PDFNetLoader.Instance()
    End Sub

    ''' <summary>
    ''' Starts a new console line and writes two spaces per indentation level.
    ''' </summary>
    ''' <param name="indent">Number of indentation levels to write.</param>
    Sub PrintIndent(ByVal indent As Integer)
        Console.WriteLine()

        For i As Integer = 0 To indent - 1
            Console.Write("  ")
        Next
    End Sub

    ''' <summary>
    ''' Writes the type and title of a structure element, then describes each
    ''' of its kids. Content items are reported with their page index and MCID
    ''' or referenced object number; kids that are structure elements are
    ''' processed recursively. An invalid element writes nothing.
    ''' </summary>
    ''' <param name="element">Structure element to describe.</param>
    ''' <param name="indent">Indentation level for this element's line; kids use one level more.</param>
    Sub ProcessStructElement(ByVal element As SElement, ByVal indent As Integer)
        If Not element.IsValid() Then
            Return
        End If

        ' Print this element at the current level, then indent its kids one level deeper.
        PrintIndent(indent)
        indent += 1
        Console.Write("Type: " & element.[GetType]())

        If element.HasTitle() Then
            Console.Write(". Title: " & element.GetTitle())
        End If

        Dim num As Integer = element.GetNumKids()

        For i As Integer = 0 To num - 1

            ' Check if the kid is a leaf node (i.e. it is a ContentItem).
            If element.IsContentItem(i) Then
                Dim cont As ContentItem = element.GetAsContentItem(i)
                Dim type As ContentItem.Type = cont.[GetType]()
                Dim page As Page = cont.GetPage()
                PrintIndent(indent)
                Console.Write("Content Item. Part of page #" & page.GetIndex())
                PrintIndent(indent)

                Select Case type
                    Case ContentItem.Type.e_MCID, ContentItem.Type.e_MCR
                        Console.Write("MCID: " & cont.GetMCID())
                    Case ContentItem.Type.e_OBJR
                        Console.Write("OBJR ")
                        Dim ref_obj As Obj = cont.GetRefObj()
                        If ref_obj IsNot Nothing Then Console.Write("- Referenced Object#: " & ref_obj.GetObjNum())
                    Case Else
                End Select
            Else
                ProcessStructElement(element.GetAsStructElem(i), indent)
            End If
        Next
    End Sub

    ''' <summary>
    ''' Used in sample 2. Writes each path, text and form XObject element read
    ''' from the reader, followed by the type, MCID, title and object number of
    ''' its parent structure element when that parent is valid. Elements of any
    ''' other type are skipped.
    ''' </summary>
    ''' <param name="reader">Reader that has already been started on a page.</param>
    Sub ProcessElements(ByVal reader As ElementReader)
        Dim element As Element = reader.Next()
        While Not IsNothing(element)  ' Read page contents
            Dim type As Element.Type = element.[GetType]()

            ' The condition must name e_form so that the form XObject case below is reachable.
            If type = element.Type.e_path OrElse type = element.Type.e_text OrElse type = element.Type.e_form Then

                Select Case type
                    Case element.Type.e_path
                        Console.WriteLine()
                        Console.Write("PATH: ")
                    Case element.Type.e_text
                        Console.WriteLine()
                        Console.WriteLine("TEXT: " & element.GetTextString())
                    Case element.Type.e_form
                        Console.WriteLine()
                        Console.Write("FORM XObject: ")
                End Select

                ' Check if the element is associated with any structural element.
                Dim struct_parent As SElement = element.GetParentStructElement()

                If struct_parent.IsValid() Then
                    Console.Write(" Type: " & struct_parent.[GetType]() & ", MCID: " + String.Format("{0}", element.GetStructMCID()))

                    If struct_parent.HasTitle() Then
                        Console.Write(". Title: " & struct_parent.GetTitle())
                    End If

                    Console.Write(", Obj#: " & struct_parent.GetSDFObj().GetObjNum())
                End If
            End If
            element = reader.Next()
        End While
    End Sub

    ''' <summary>
    ''' Used in sample 3. Adds the text of every text element that has a
    ''' non-negative struct MCID to the given map, keyed by that MCID. Text for
    ''' an MCID that is already present is appended to the stored value.
    ''' </summary>
    ''' <param name="reader">Reader that has already been started on a page.</param>
    ''' <param name="mcid_page_map">Map from MCID to text; modified in place.</param>
    Sub ProcessElements2(ByVal reader As ElementReader, ByVal mcid_page_map As Hashtable)
        Dim element As Element = reader.Next()
        While Not IsNothing(element)  ' Read page contents
            Dim mcid As Integer = element.GetStructMCID()

            If mcid >= 0 AndAlso element.[GetType]() = element.Type.e_text Then
                Dim val As String = element.GetTextString()

                If mcid_page_map.ContainsKey(mcid) Then
                    mcid_page_map(mcid) = (CStr((mcid_page_map(mcid))) & val)
                Else
                    mcid_page_map.Add(mcid, val)
                End If
            End If
            element = reader.Next()
        End While
    End Sub

    ''' <summary>
    ''' Used in sample 3. Writes a structure element and its descendants as
    ''' XML-like tags. Kids that are content items of type e_MCID write the
    ''' text stored for their page index and MCID when both keys exist; kids
    ''' that are structure elements are written recursively one level deeper.
    ''' An invalid element writes nothing.
    ''' </summary>
    ''' <param name="element">Structure element to write.</param>
    ''' <param name="mcid_doc_map">Map from page index to a map from MCID to text.</param>
    ''' <param name="indent">Nesting level used for the opening and closing tags.</param>
    Sub ProcessStructElement2(ByVal element As SElement, ByVal mcid_doc_map As Hashtable, ByVal indent As Integer)
        If Not element.IsValid() Then
            Return
        End If

        PrintIndent(indent)
        Console.Write("<" & element.[GetType]())

        If element.HasTitle() Then
            Console.Write(" title=""" & element.GetTitle() & """")
        End If

        Console.Write(">")
        Dim num As Integer = element.GetNumKids()

        For i As Integer = 0 To num - 1

            If element.IsContentItem(i) Then
                Dim cont As ContentItem = element.GetAsContentItem(i)

                If cont.[GetType]() = ContentItem.Type.e_MCID Then
                    Dim page_num As Integer = cont.GetPage().GetIndex()

                    If mcid_doc_map.ContainsKey(page_num) Then
                        Dim mcid_page_map As Hashtable = CType((mcid_doc_map(page_num)), Hashtable)
                        Dim mcid As Integer = cont.GetMCID()

                        If mcid_page_map.ContainsKey(mcid) Then
                            Console.Write(mcid_page_map(mcid))
                        End If
                    End If
                End If
            Else
                ' The kid is another structure element.
                ProcessStructElement2(element.GetAsStructElem(i), mcid_doc_map, indent + 1)
            End If
        Next

        PrintIndent(indent)
        Console.Write("</" & element.[GetType]() & ">")
    End Sub


    ''' <summary>
    ''' Entry point. Opens tagged.pdf, runs the three samples, and saves the
    ''' document as LogicalStructure.pdf in the output folder. A
    ''' PDFNetException raised along the way is reported by writing its
    ''' message to the console.
    ''' </summary>
    ''' <param name="args">Command-line arguments; not used.</param>
    Sub Main(ByVal args As String())
        PDFNet.Initialize(PDFTronLicense.Key)
        ' Relative path to the folder containing the test files.
        Dim input_path As String = "../../../../TestFiles/"
        Dim output_path As String = "../../../../TestFiles/Output/"

        Try

            Using doc As PDFDoc = New PDFDoc(input_path & "tagged.pdf")
                doc.InitSecurityHandler()
                ' Switches for running each sample individually.
                Dim example1 As Boolean = True
                Dim example2 As Boolean = True
                Dim example3 As Boolean = True

                If example1 Then
                    Console.WriteLine("____________________________________________________________")
                    Console.WriteLine("Sample 1 - Traverse logical structure tree...")
                    Dim tree As STree = doc.GetStructTree()

                    If tree.IsValid() Then
                        Console.WriteLine("Document has a StructTree root.")

                        ' Recursively get structure info for all child elements.
                        For i As Integer = 0 To tree.GetNumKids() - 1
                            ProcessStructElement(tree.GetKid(i), 0)
                        Next
                    Else
                        Console.WriteLine("This document does not contain any logical structure.")
                    End If

                    Console.WriteLine()
                    Console.WriteLine("Done 1.")
                End If

                If example2 Then
                    Console.WriteLine("____________________________________________________________")
                    Console.WriteLine("Sample 2 - Get parent logical structure elements from")
                    Console.WriteLine("layout elements.")
                    Dim reader As ElementReader = New ElementReader()
                    Dim itr As PageIterator = doc.GetPageIterator()

                    While itr.HasNext()
                        reader.Begin(itr.Current())
                        ProcessElements(reader)
                        reader.[End]()
                        itr.[Next]()
                    End While

                    Console.WriteLine()
                    Console.WriteLine("Done 2.")
                End If

                If example3 Then
                    Console.WriteLine("____________________________________________________________")
                    Console.WriteLine("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
                    ' Maps a page number to a page map, which maps a struct MCID to text.
                    Dim mcid_doc_map As Hashtable = New Hashtable()
                    Dim reader As ElementReader = New ElementReader()
                    Dim itr As PageIterator = doc.GetPageIterator()

                    While itr.HasNext()
                        Dim pg As Page = itr.Current()
                        reader.Begin(pg)
                        Dim page_mcid_map As Hashtable = New Hashtable()
                        mcid_doc_map.Add(pg.GetIndex(), page_mcid_map)
                        ProcessElements2(reader, page_mcid_map)
                        reader.[End]()
                        itr.[Next]()
                    End While

                    Dim tree As STree = doc.GetStructTree()

                    If tree.IsValid() Then

                        For i As Integer = 0 To tree.GetNumKids() - 1
                            ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0)
                        Next
                    End If

                    Console.WriteLine()
                    Console.WriteLine("Done 3.")
                End If

                doc.Save(output_path & "LogicalStructure.pdf", 0)
            End Using

        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try
        PDFNet.Terminate()
    End Sub

End Module

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales

Product:

PDF Logical Structure Reader - Python Sample Code