Sample C# code that uses the Apryse SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s, which are the leaf nodes of the structure tree.
Learn more about our full PDF Data Extraction SDK Capabilities.
To start your free trial, get started with the Xamarin SDK.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6using System;
7using System.Collections;
8
9using pdftron;
10using pdftron.Common;
11using pdftron.Filters;
12using pdftron.SDF;
13using pdftron.PDF;
14using pdftron.PDF.Struct;
15
16
17using NUnit.Framework;
18
19namespace MiscellaneousSamples
20{
21	//---------------------------------------------------------------------------------------
22	// This sample explores the structure and content of a tagged PDF document and dumps 
23	// the structure information to the console window.
24	//
25	// In tagged PDF documents StructTree acts as a central repository for information 
26	// related to a PDF document's logical structure. The tree consists of StructElement-s
27	// and ContentItem-s which are leaf nodes of the structure tree.
28	//
29	// The sample can be extended to access and extract the marked-content elements such 
30	// as text and images.
31	//---------------------------------------------------------------------------------------
32	[TestFixture]
33	public class LogicalStructureTest
34	{
35		static void PrintIndent(int indent) { Console.WriteLine(); for (int i=0; i<indent; ++i) Console.Write("  "); }
36
37		// Used in code snippet 1.
38		static void ProcessStructElement(SElement element, int indent)
39		{
40			if (!element.IsValid()) {
41				return;
42			}
43
44			// Print out the type and title info, if any.
45			PrintIndent(indent++);
46			Console.Write("Type: " + element.GetType());
47			if (element.HasTitle()) {
48				Console.Write(". Title: "+ element.GetTitle());
49			}
50
51			int num = element.GetNumKids();
52			for (int i=0; i<num; ++i) 
53			{
54				// Check is the kid is a leaf node (i.e. it is a ContentItem).
55				if (element.IsContentItem(i)) { 
56					ContentItem cont = element.GetAsContentItem(i); 
57					ContentItem.Type type = cont.GetType();
58
59					Page page = cont.GetPage();
60
61					PrintIndent(indent);
62					Console.Write("Content Item. Part of page #" + page.GetIndex());
63
64					PrintIndent(indent);
65					switch (type) {
66						case ContentItem.Type.e_MCID:
67						case ContentItem.Type.e_MCR:
68							Console.Write("MCID: " + cont.GetMCID());
69							break;
70						case ContentItem.Type.e_OBJR:
71							{
72								Console.Write("OBJR ");
73								Obj ref_obj = cont.GetRefObj();
74								if (ref_obj!=null)
75									Console.Write("- Referenced Object#: " + ref_obj.GetObjNum());
76							}
77							break;
78						default: 
79							break;
80					}
81				}
82				else {  // the kid is another StructElement node.
83					ProcessStructElement(element.GetAsStructElem(i), indent);
84				}
85			}
86		}
87
88		// Used in code snippet 2.
89		static void ProcessElements(ElementReader reader)
90		{
91			Element element;
92			while ((element = reader.Next())!=null) 	// Read page contents
93			{
94				// In this sample we process only paths & text, but the code can be 
95				// extended to handle any element type.
96				Element.Type type = element.GetType();
97				if (type == Element.Type.e_path || type == Element.Type.e_text || type == Element.Type.e_path) 
98				{   
99					switch (type)	{
100					case Element.Type.e_path:               // Process path ...
101                        Console.WriteLine();
102						Console.Write("PATH: ");
103	 					break; 
104					case Element.Type.e_text: 				// Process text ...
105                        Console.WriteLine();
106                        Console.WriteLine("TEXT: " + element.GetTextString());
107						break;
108					case Element.Type.e_form:				// Process form XObjects
109                        Console.WriteLine();
110						Console.Write("FORM XObject: ");
111						//reader.FormBegin(); 
112						//ProcessElements(reader);
113						//reader.End(); 
114						break; 
115					}
116
117					// Check if the element is associated with any structural element.
118					// Content items are leaf nodes of the structure tree.
119					SElement struct_parent = element.GetParentStructElement();
120					if (struct_parent.IsValid()) {
121						// Print out the parent structural element's type, title, and object number.
122						Console.Write(" Type: " + struct_parent.GetType() 
123							+ ", MCID: " + element.GetStructMCID());
124						if (struct_parent.HasTitle()) {
125							Console.Write(". Title: "+ struct_parent.GetTitle());
126						}
127						Console.Write(", Obj#: " + struct_parent.GetSDFObj().GetObjNum());
128					}
129				}
130			}
131		}
132
133		// Used in code snippet 3.
134		//typedef map<int, string> MCIDPageMap;
135		//typedef map<int, MCIDPageMap> MCIDDocMap;
136
137		// Used in code snippet 3.
138		static void ProcessElements2(ElementReader reader, Hashtable mcid_page_map)
139		{
140			Element element;
141			while ((element = reader.Next())!=null) // Read page contents
142			{
143				// In this sample we process only text, but the code can be extended 
144				// to handle paths, images, or any other Element type.
145				int mcid = element.GetStructMCID();
146				if (mcid>= 0 && element.GetType() == Element.Type.e_text) {
147					String val = element.GetTextString();
148					if (mcid_page_map.ContainsKey(mcid)) mcid_page_map[mcid] = ((String)(mcid_page_map[mcid])+ val); 
149					else mcid_page_map.Add(mcid, val);
150				}
151			}
152		}
153
154		// Used in code snippet 3.
155		static void ProcessStructElement2(SElement element, Hashtable mcid_doc_map, int indent)
156		{
157			if (!element.IsValid()) {
158				return;
159			}
160
161			// Print out the type and title info, if any.
162			PrintIndent(indent);
163			Console.Write("<" + element.GetType());
164			if (element.HasTitle()) {
165				Console.Write(" title=\""+ element.GetTitle() + "\"");
166			}
167			Console.Write(">");
168
169			int num = element.GetNumKids();
170			for (int i=0; i<num; ++i) 
171			{		
172				if (element.IsContentItem(i)) { 
173					ContentItem cont = element.GetAsContentItem(i); 
174					if (cont.GetType() == ContentItem.Type.e_MCID) {
175						int page_num = cont.GetPage().GetIndex();
176						if (mcid_doc_map.ContainsKey(page_num)) {
177							Hashtable mcid_page_map = (Hashtable)(mcid_doc_map[page_num]);
178							int mcid = cont.GetMCID();
179							if (mcid_page_map.ContainsKey(mcid)) {
180								Console.Write(mcid_page_map[mcid]); 
181							}                    
182						}
183					}
184				}
185				else {  // the kid is another StructElement node.
186					ProcessStructElement2(element.GetAsStructElem(i), mcid_doc_map, indent+1);
187				}
188			}
189
190			PrintIndent(indent);
191			Console.Write("</" + element.GetType() + ">");
192		}
193
194
195		/// <summary>
196		/// The main entry point for the application.
197		/// </summary>
198		[Test]
199		public static void Sample()
200		{
201			// Relative path to the folder containing test files.
202			const string input_path =  "TestFiles/";
203
204			try  // Extract logical structure from a PDF document
205			{
206				using (PDFDoc doc = new PDFDoc(Utils.GetAssetTempFile(input_path + "tagged.pdf")))
207				{
208					doc.InitSecurityHandler();
209
210					bool example1 = true;
211					bool example2 = true;
212					bool example3 = true;
213
214					if (example1)
215					{
216						Console.WriteLine("____________________________________________________________");
217						Console.WriteLine("Sample 1 - Traverse logical structure tree...");
218
219						STree tree = doc.GetStructTree();
220						if (tree.IsValid()) 
221						{
222							Console.WriteLine("Document has a StructTree root.");
223							for (int i=0; i<tree.GetNumKids(); ++i) 
224							{
225								// Recursively get structure  info for all all child elements.
226								ProcessStructElement(tree.GetKid(i), 0);
227							}
228						}
229						else 
230						{
231							Console.WriteLine("This document does not contain any logical structure.");
232						}
233
234                        Console.WriteLine();
235                        Console.WriteLine("Done 1.");
236					}
237
238					if (example2)
239					{
240						Console.WriteLine("____________________________________________________________");
241						Console.WriteLine("Sample 2 - Get parent logical structure elements from");
242						Console.WriteLine("layout elements.");
243						
244						ElementReader reader=new ElementReader();
245						for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) 
246						{				
247							reader.Begin(itr.Current());
248							ProcessElements(reader);
249							reader.End();
250						}
251                        Console.WriteLine();
252						Console.WriteLine("Done 2.");
253					}
254
255					if (example3)
256					{
257						Console.WriteLine("____________________________________________________________");
258						Console.WriteLine("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
259						
260						//A map which maps page numbers(as Integers)
261						//to page Maps(which map from struct mcid(as Integers) to
262						//text Strings)
263						Hashtable mcid_doc_map=new Hashtable();
264						ElementReader reader=new ElementReader();
265						for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) 
266						{				
267							Page pg = itr.Current();
268							reader.Begin(pg);
269							Hashtable page_mcid_map=new Hashtable();
270							mcid_doc_map.Add(pg.GetIndex(), page_mcid_map);
271							ProcessElements2(reader, page_mcid_map);
272							reader.End();
273						}
274						
275						STree tree = doc.GetStructTree();
276						if (tree.IsValid()) 
277						{
278							for (int i=0; i<tree.GetNumKids(); ++i) 
279							{
280								ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0);
281							}
282						}
283                        Console.WriteLine();
284                        Console.WriteLine("Done 3.");
285					}
286
287					doc.Save(Utils.CreateExternalFile("LogicalStructure.pdf"), 0);
288				}
289			}
290			catch (PDFNetException e)
291			{
292				Console.WriteLine(e.Message);
293				Assert.True(false);
294			}
295		}
296	}
297}
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales