LogicalStructure

Sample C# code that uses the Apryse SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s; the ContentItem-s are the leaf nodes of the structure tree. Learn more about our Xamarin SDK and PDF Data Extraction SDK Capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6using System;
7using System.Collections;
8
9using pdftron;
10using pdftron.Common;
11using pdftron.Filters;
12using pdftron.SDF;
13using pdftron.PDF;
14using pdftron.PDF.Struct;
15
16
17using NUnit.Framework;
18
19namespace MiscellaneousSamples
20{
21 //---------------------------------------------------------------------------------------
22 // This sample explores the structure and content of a tagged PDF document and dumps
23 // the structure information to the console window.
24 //
25 // In tagged PDF documents StructTree acts as a central repository for information
26 // related to a PDF document's logical structure. The tree consists of StructElement-s
27 // and ContentItem-s which are leaf nodes of the structure tree.
28 //
29 // The sample can be extended to access and extract the marked-content elements such
30 // as text and images.
31 //---------------------------------------------------------------------------------------
32 [TestFixture]
33 public class LogicalStructureTest
34 {
/// <summary>
/// Starts a new console line and then writes one space per indentation level.
/// </summary>
/// <param name="indent">Number of indentation levels; zero or a negative value emits only the line break.</param>
static void PrintIndent(int indent)
{
    Console.WriteLine();
    for (int remaining = indent; remaining > 0; --remaining)
    {
        Console.Write(" ");
    }
}
36
37 // Used in code snippet 1.
/// <summary>
/// Recursively writes a structure element and everything beneath it to the console.
/// For each element the type (and title, when present) is printed; each content-item
/// kid is reported with its page index and either its MCID or its referenced object number.
/// </summary>
/// <param name="element">Structure element to print. Invalid elements are ignored.</param>
/// <param name="indent">Indentation level of this element's line; kids are printed one level deeper.</param>
static void ProcessStructElement(SElement element, int indent)
{
    if (element.IsValid() == false)
    {
        return;
    }

    // Header line for this element; everything below it is nested one level deeper.
    PrintIndent(indent);
    int kidIndent = indent + 1;

    Console.Write("Type: " + element.GetType());
    if (element.HasTitle())
    {
        Console.Write(". Title: "+ element.GetTitle());
    }

    int kidCount = element.GetNumKids();
    for (int k = 0; k < kidCount; k++)
    {
        // A kid that is not a content item is a nested structure element: recurse.
        if (!element.IsContentItem(k))
        {
            ProcessStructElement(element.GetAsStructElem(k), kidIndent);
            continue;
        }

        // Leaf node: a content item.
        ContentItem item = element.GetAsContentItem(k);
        ContentItem.Type itemType = item.GetType();
        Page owner = item.GetPage();

        PrintIndent(kidIndent);
        Console.Write("Content Item. Part of page #" + owner.GetIndex());

        PrintIndent(kidIndent);
        if (itemType == ContentItem.Type.e_MCID || itemType == ContentItem.Type.e_MCR)
        {
            Console.Write("MCID: " + item.GetMCID());
        }
        else if (itemType == ContentItem.Type.e_OBJR)
        {
            Console.Write("OBJR ");
            Obj target = item.GetRefObj();
            if (target != null)
            {
                Console.Write("- Referenced Object#: " + target.GetObjNum());
            }
        }
        // Any other content-item type prints nothing after the indent.
    }
}
87
88 // Used in code snippet 2.
/// <summary>
/// Reads every element from <paramref name="reader"/> and, for path, text and form
/// XObject elements, prints a short label followed by information about the
/// structure element (if any) that the layout element belongs to.
/// </summary>
/// <param name="reader">
/// Element reader to drain; the caller is expected to have started it on a page
/// and to end it afterwards.
/// </param>
static void ProcessElements(ElementReader reader)
{
    Element element;
    while ((element = reader.Next()) != null) // Read page contents
    {
        // Only paths, text and form XObjects are reported; the code can be
        // extended to handle any other element type.
        // (The original pre-filter tested e_path twice and never e_form, which
        // made the form case below unreachable.)
        switch (element.GetType())
        {
            case Element.Type.e_path: // Process path ...
                Console.WriteLine();
                Console.Write("PATH: ");
                break;
            case Element.Type.e_text: // Process text ...
                Console.WriteLine();
                Console.WriteLine("TEXT: " + element.GetTextString());
                break;
            case Element.Type.e_form: // Process form XObjects
                Console.WriteLine();
                Console.Write("FORM XObject: ");
                // To descend into the form's own content, wrap a recursive call:
                //   reader.FormBegin(); ProcessElements(reader); reader.End();
                break;
            default:
                // Not a type this sample reports; move on to the next element.
                continue;
        }

        // Check if the element is associated with any structural element.
        // Content items are leaf nodes of the structure tree.
        SElement struct_parent = element.GetParentStructElement();
        if (struct_parent.IsValid())
        {
            // Print out the parent structural element's type, MCID, title, and object number.
            Console.Write(" Type: " + struct_parent.GetType()
                + ", MCID: " + element.GetStructMCID());
            if (struct_parent.HasTitle())
            {
                Console.Write(". Title: "+ struct_parent.GetTitle());
            }
            Console.Write(", Obj#: " + struct_parent.GetSDFObj().GetObjNum());
        }
    }
}
132
133 // Used in code snippet 3.
134 //typedef map<int, string> MCIDPageMap;
135 //typedef map<int, MCIDPageMap> MCIDDocMap;
136
137 // Used in code snippet 3.
/// <summary>
/// Drains <paramref name="reader"/> and collects the text of every text element
/// that carries a non-negative structure MCID, concatenating the strings that
/// share the same MCID in reading order.
/// </summary>
/// <param name="reader">Element reader to drain.</param>
/// <param name="mcid_page_map">
/// Table updated in place: key is the MCID (int), value is the accumulated text (String).
/// </param>
static void ProcessElements2(ElementReader reader, Hashtable mcid_page_map)
{
    // Only text is processed here, but the loop can be extended to handle
    // paths, images, or any other Element type.
    for (Element current = reader.Next(); current != null; current = reader.Next())
    {
        int mcid = current.GetStructMCID();
        if (mcid < 0)
        {
            continue;
        }
        if (current.GetType() != Element.Type.e_text)
        {
            continue;
        }

        String text = current.GetTextString();
        if (!mcid_page_map.ContainsKey(mcid))
        {
            mcid_page_map.Add(mcid, text);
        }
        else
        {
            String soFar = (String)(mcid_page_map[mcid]);
            mcid_page_map[mcid] = soFar + text;
        }
    }
}
153
154 // Used in code snippet 3.
/// <summary>
/// Recursively prints a structure element as an XML-like tag pair, writing the
/// text collected for each of its MCID content items between the opening and
/// closing tags.
/// </summary>
/// <param name="element">Structure element to print. Invalid elements are ignored.</param>
/// <param name="mcid_doc_map">
/// Table keyed by page index (int) whose values are per-page tables mapping MCID (int) to text.
/// </param>
/// <param name="indent">Indentation level of the tags; nested elements are printed one level deeper.</param>
static void ProcessStructElement2(SElement element, Hashtable mcid_doc_map, int indent)
{
    if (element.IsValid() == false)
    {
        return;
    }

    // Opening tag, with an optional title attribute.
    PrintIndent(indent);
    Console.Write("<" + element.GetType());
    if (element.HasTitle())
    {
        Console.Write(" title=\""+ element.GetTitle() + "\"");
    }
    Console.Write(">");

    int kidCount = element.GetNumKids();
    for (int k = 0; k < kidCount; k++)
    {
        // Nested structure element: recurse one level deeper.
        if (!element.IsContentItem(k))
        {
            ProcessStructElement2(element.GetAsStructElem(k), mcid_doc_map, indent + 1);
            continue;
        }

        // Leaf node: only MCID content items have text to print.
        ContentItem item = element.GetAsContentItem(k);
        if (item.GetType() != ContentItem.Type.e_MCID)
        {
            continue;
        }

        int pageIndex = item.GetPage().GetIndex();
        if (!mcid_doc_map.ContainsKey(pageIndex))
        {
            continue;
        }

        Hashtable pageTexts = (Hashtable)(mcid_doc_map[pageIndex]);
        int mcid = item.GetMCID();
        if (pageTexts.ContainsKey(mcid))
        {
            Console.Write(pageTexts[mcid]);
        }
    }

    // Closing tag.
    PrintIndent(indent);
    Console.Write("</" + element.GetType() + ">");
}
193
194
/// <summary>
/// Test entry point: opens "tagged.pdf" and runs three examples against it —
/// (1) a traversal of the logical structure tree, (2) a per-page scan that reports
/// the parent structure element of layout elements, and (3) an "XML style" dump
/// combining the structure tree with page text — then saves the document.
/// A <see cref="PDFNetException"/> is logged and fails the test.
/// </summary>
[Test]
public static void Sample()
{
    // Relative path to the folder containing test files.
    const string input_path = "TestFiles/";

    try // Extract logical structure from a PDF document
    {
        using (PDFDoc doc = new PDFDoc(Utils.GetAssetTempFile(input_path + "tagged.pdf")))
        {
            doc.InitSecurityHandler();

            // Toggles for the individual examples.
            bool example1 = true;
            bool example2 = true;
            bool example3 = true;

            if (example1)
            {
                Console.WriteLine("____________________________________________________________");
                Console.WriteLine("Sample 1 - Traverse logical structure tree...");

                STree tree = doc.GetStructTree();
                if (tree.IsValid())
                {
                    Console.WriteLine("Document has a StructTree root.");
                    for (int i = 0; i < tree.GetNumKids(); ++i)
                    {
                        // Recursively get structure info for all child elements.
                        ProcessStructElement(tree.GetKid(i), 0);
                    }
                }
                else
                {
                    Console.WriteLine("This document does not contain any logical structure.");
                }

                Console.WriteLine();
                Console.WriteLine("Done 1.");
            }

            if (example2)
            {
                Console.WriteLine("____________________________________________________________");
                Console.WriteLine("Sample 2 - Get parent logical structure elements from");
                Console.WriteLine("layout elements.");

                // Dispose the reader deterministically instead of leaving it to the finalizer.
                using (ElementReader reader = new ElementReader())
                {
                    for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next())
                    {
                        reader.Begin(itr.Current());
                        ProcessElements(reader);
                        reader.End();
                    }
                }
                Console.WriteLine();
                Console.WriteLine("Done 2.");
            }

            if (example3)
            {
                Console.WriteLine("____________________________________________________________");
                Console.WriteLine("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");

                // Maps page index (int) to a per-page table, which in turn maps
                // struct MCID (int) to the text (String) collected for it.
                Hashtable mcid_doc_map = new Hashtable();
                using (ElementReader reader = new ElementReader())
                {
                    for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next())
                    {
                        Page pg = itr.Current();
                        reader.Begin(pg);
                        Hashtable page_mcid_map = new Hashtable();
                        mcid_doc_map.Add(pg.GetIndex(), page_mcid_map);
                        ProcessElements2(reader, page_mcid_map);
                        reader.End();
                    }
                }

                STree tree = doc.GetStructTree();
                if (tree.IsValid())
                {
                    for (int i = 0; i < tree.GetNumKids(); ++i)
                    {
                        ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0);
                    }
                }
                Console.WriteLine();
                Console.WriteLine("Done 3.");
            }

            doc.Save(Utils.CreateExternalFile("LogicalStructure.pdf"), 0);
        }
    }
    catch (PDFNetException e)
    {
        Console.WriteLine(e.Message);
        // Fail with the exception text so the test report shows the cause.
        Assert.Fail(e.Message);
    }
}
296 }
297}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales