LogicalStructure

Sample code that uses the Apryse Server SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s; the ContentItem-s are the leaf nodes of the structure tree. Learn more about our Server SDK and PDF Data Extraction SDK Capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6using System;
7using System.Collections;
8
9using pdftron;
10using pdftron.Common;
11using pdftron.Filters;
12using pdftron.SDF;
13using pdftron.PDF;
14using pdftron.PDF.Struct;
15
16
17namespace LogicalStructureTestCS
18{
19 //---------------------------------------------------------------------------------------
20 // This sample explores the structure and content of a tagged PDF document and dumps
21 // the structure information to the console window.
22 //
23 // In tagged PDF documents StructTree acts as a central repository for information
24 // related to a PDF document's logical structure. The tree consists of StructElement-s
25 // and ContentItem-s which are leaf nodes of the structure tree.
26 //
27 // The sample can be extended to access and extract the marked-content elements such
28 // as text and images.
29 //---------------------------------------------------------------------------------------
30 class Class1
31 {
/// <summary>
/// Starts a new console line and then writes one space per indentation level.
/// </summary>
/// <param name="indent">Number of single-space indentation units written after the line break.</param>
static void PrintIndent(int indent)
{
    Console.WriteLine();

    // Count down from the requested depth; a zero or negative depth writes nothing.
    int remaining = indent;
    while (remaining > 0)
    {
        Console.Write(" ");
        --remaining;
    }
}
33
34 // Used in code snippet 1.
/// <summary>
/// Recursively prints a structure element and everything below it.
/// For the element itself the type (and title, if it has one) is printed; each
/// content-item kid is reported with its page index and either its MCID or its
/// referenced object number; each structure-element kid is processed recursively
/// one indentation level deeper.
/// </summary>
/// <param name="element">The structure element to print. Invalid elements are ignored.</param>
/// <param name="indent">Indentation level used for this element's own line.</param>
static void ProcessStructElement(SElement element, int indent)
{
    if (!element.IsValid())
    {
        return;
    }

    // Header line for this element; all of its kids are printed one level deeper.
    PrintIndent(indent);
    int childIndent = indent + 1;
    Console.Write("Type: " + element.GetType());
    if (element.HasTitle())
    {
        Console.Write(". Title: " + element.GetTitle());
    }

    int kidCount = element.GetNumKids();
    for (int kid = 0; kid < kidCount; ++kid)
    {
        // A kid that is not a content item is another StructElement node: recurse.
        if (!element.IsContentItem(kid))
        {
            ProcessStructElement(element.GetAsStructElem(kid), childIndent);
            continue;
        }

        // Otherwise the kid is a leaf node (a ContentItem).
        ContentItem item = element.GetAsContentItem(kid);
        ContentItem.Type itemType = item.GetType();
        Page owner = item.GetPage();

        PrintIndent(childIndent);
        Console.Write("Content Item. Part of page #" + owner.GetIndex());

        PrintIndent(childIndent);
        if (itemType == ContentItem.Type.e_MCID || itemType == ContentItem.Type.e_MCR)
        {
            Console.Write("MCID: " + item.GetMCID());
        }
        else if (itemType == ContentItem.Type.e_OBJR)
        {
            Console.Write("OBJR ");
            Obj referenced = item.GetRefObj();
            if (referenced != null)
            {
                Console.Write("- Referenced Object#: " + referenced.GetObjNum());
            }
        }
        // Any other content item type prints nothing further.
    }
}
84
85 // Used in code snippet 2.
/// <summary>
/// Reads every remaining element from <paramref name="reader"/> and, for each path,
/// text run, or form XObject, prints a label followed by details of the structure
/// element the layout element belongs to (when it has a valid one).
/// </summary>
/// <param name="reader">An element reader on which Begin() has already been called by the caller.</param>
static void ProcessElements(ElementReader reader)
{
    Element element;
    while ((element = reader.Next()) != null) // Read page contents
    {
        // In this sample we report paths, text and form XObjects, but the code can be
        // extended to handle any element type. A single switch replaces the former
        // pre-filter, which tested e_path twice and so never let e_form through.
        switch (element.GetType())
        {
            case Element.Type.e_path: // Process path ...
                Console.WriteLine();
                Console.Write("PATH: ");
                break;
            case Element.Type.e_text: // Process text ...
                Console.WriteLine();
                Console.WriteLine("TEXT: " + element.GetTextString());
                break;
            case Element.Type.e_form: // Process form XObjects
                Console.WriteLine();
                Console.Write("FORM XObject: ");
                // To descend into the form's own content, enable:
                //reader.FormBegin();
                //ProcessElements(reader);
                //reader.End();
                break;
            default:
                // Not an element type this sample reports; move on to the next element.
                continue;
        }

        // Check if the element is associated with any structural element.
        // Content items are leaf nodes of the structure tree.
        SElement struct_parent = element.GetParentStructElement();
        if (struct_parent.IsValid())
        {
            // Print out the parent structural element's type, title, and object number.
            Console.Write(" Type: " + struct_parent.GetType()
                + ", MCID: " + element.GetStructMCID());
            if (struct_parent.HasTitle())
            {
                Console.Write(". Title: " + struct_parent.GetTitle());
            }
            Console.Write(", Obj#: " + struct_parent.GetSDFObj().GetObjNum());
        }
    }
}
129
130 // Used in code snippet 3.
131 //typedef map<int, string> MCIDPageMap;
132 //typedef map<int, MCIDPageMap> MCIDDocMap;
133
134 // Used in code snippet 3.
/// <summary>
/// Collects the text found on a page into <paramref name="mcid_page_map"/>, keyed by
/// struct MCID. Text from several elements that share the same MCID is concatenated
/// in reading order.
/// </summary>
/// <param name="reader">An element reader on which Begin() has already been called by the caller.</param>
/// <param name="mcid_page_map">Map from MCID (int) to accumulated text (String); updated in place.</param>
static void ProcessElements2(ElementReader reader, Hashtable mcid_page_map)
{
    // Read page contents one element at a time until the reader is exhausted.
    for (Element current = reader.Next(); current != null; current = reader.Next())
    {
        // In this sample we process only text, but the code can be extended
        // to handle paths, images, or any other Element type.
        int mcid = current.GetStructMCID();
        if (mcid < 0 || current.GetType() != Element.Type.e_text)
        {
            continue;
        }

        String text = current.GetTextString();
        if (!mcid_page_map.ContainsKey(mcid))
        {
            // First text seen for this MCID.
            mcid_page_map.Add(mcid, text);
        }
        else
        {
            // Append to the text already gathered for this MCID.
            String soFar = (String)mcid_page_map[mcid];
            mcid_page_map[mcid] = soFar + text;
        }
    }
}
150
151 // Used in code snippet 3.
/// <summary>
/// Recursively prints a structure element in an XML-like form: an opening tag with
/// the element type (and a title attribute, if any), then the text looked up for
/// each of its e_MCID content items, then its child structure elements one level
/// deeper, and finally a closing tag.
/// </summary>
/// <param name="element">The structure element to print. Invalid elements are ignored.</param>
/// <param name="mcid_doc_map">Map from page index (int) to a per-page Hashtable that maps MCID (int) to text.</param>
/// <param name="indent">Indentation level used for the opening and closing tags.</param>
static void ProcessStructElement2(SElement element, Hashtable mcid_doc_map, int indent)
{
    if (!element.IsValid())
    {
        return;
    }

    // Opening tag with the type and title info, if any.
    PrintIndent(indent);
    Console.Write("<" + element.GetType());
    if (element.HasTitle())
    {
        Console.Write(" title=\"" + element.GetTitle() + "\"");
    }
    Console.Write(">");

    int kidCount = element.GetNumKids();
    for (int kid = 0; kid < kidCount; ++kid)
    {
        // A kid that is not a content item is another StructElement node: recurse.
        if (!element.IsContentItem(kid))
        {
            ProcessStructElement2(element.GetAsStructElem(kid), mcid_doc_map, indent + 1);
            continue;
        }

        // Only e_MCID content items are looked up in the map.
        ContentItem item = element.GetAsContentItem(kid);
        if (item.GetType() != ContentItem.Type.e_MCID)
        {
            continue;
        }

        // Skip items whose page has no entry in the document map.
        int pageNumber = item.GetPage().GetIndex();
        if (!mcid_doc_map.ContainsKey(pageNumber))
        {
            continue;
        }

        Hashtable pageTexts = (Hashtable)mcid_doc_map[pageNumber];
        int mcid = item.GetMCID();
        if (pageTexts.ContainsKey(mcid))
        {
            Console.Write(pageTexts[mcid]);
        }
    }

    // Closing tag.
    PrintIndent(indent);
    Console.Write("</" + element.GetType() + ">");
}
190
// Holds the instance returned by PDFNetLoader.Instance(). The field is assigned only
// here, so it is readonly.
// NOTE(review): presumably this call makes the PDFNet library available before the
// SDK is used in Main — confirm against the PDFNetLoader documentation.
private static readonly pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();

// Intentionally empty. Declaring an explicit static constructor makes the static field
// initializer above run right before the first use of this class (including the call
// to Main) instead of at a runtime-chosen earlier point.
static Class1() {}
193
/// <summary>
/// The main entry point for the application. Opens the tagged test PDF, runs the
/// three logical-structure samples against it, and saves the document to the
/// output folder. A PDFNetException is reported on the console.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    PDFNet.Initialize(PDFTronLicense.Key);

    // Relative path to the folder containing test files.
    string input_path = "../../../../TestFiles/";
    string output_path = "../../../../TestFiles/Output/";

    try // Extract logical structure from a PDF document
    {
        using (PDFDoc doc = new PDFDoc(input_path + "tagged.pdf"))
        {
            doc.InitSecurityHandler();

            // Flip any of these to false to skip the corresponding sample.
            bool example1 = true;
            bool example2 = true;
            bool example3 = true;

            if (example1)
            {
                RunStructTreeTraversalSample(doc);
            }

            if (example2)
            {
                RunParentStructElementSample(doc);
            }

            if (example3)
            {
                RunXmlStyleExtractionSample(doc);
            }

            doc.Save(output_path + "LogicalStructure.pdf", 0);
        }
    }
    catch (PDFNetException e)
    {
        Console.WriteLine(e.Message);
    }
    finally
    {
        // Always shut the SDK down, even if an exception other than
        // PDFNetException propagates out of the samples.
        PDFNet.Terminate();
    }
}

// Sample 1: walks the structure tree from its root and prints every element.
static void RunStructTreeTraversalSample(PDFDoc doc)
{
    Console.WriteLine("____________________________________________________________");
    Console.WriteLine("Sample 1 - Traverse logical structure tree...");

    STree tree = doc.GetStructTree();
    if (tree.IsValid())
    {
        Console.WriteLine("Document has a StructTree root.");
        int kidCount = tree.GetNumKids();
        for (int i = 0; i < kidCount; ++i)
        {
            // Recursively get structure info for all child elements.
            ProcessStructElement(tree.GetKid(i), 0);
        }
    }
    else
    {
        Console.WriteLine("This document does not contain any logical structure.");
    }

    Console.WriteLine();
    Console.WriteLine("Done 1.");
}

// Sample 2: reads each page's layout elements and prints their parent structure elements.
static void RunParentStructElementSample(PDFDoc doc)
{
    Console.WriteLine("____________________________________________________________");
    Console.WriteLine("Sample 2 - Get parent logical structure elements from");
    Console.WriteLine("layout elements.");

    // Dispose the reader deterministically once every page has been read.
    using (ElementReader reader = new ElementReader())
    {
        for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next())
        {
            reader.Begin(itr.Current());
            ProcessElements(reader);
            reader.End();
        }
    }

    Console.WriteLine();
    Console.WriteLine("Done 2.");
}

// Sample 3: gathers page text by MCID, then prints the structure tree in an XML-like form.
static void RunXmlStyleExtractionSample(PDFDoc doc)
{
    Console.WriteLine("____________________________________________________________");
    Console.WriteLine("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");

    // A map from page numbers (int) to per-page maps, each of which
    // maps a struct MCID (int) to the text (String) collected for it.
    Hashtable mcid_doc_map = new Hashtable();

    // Dispose the reader deterministically once every page has been read.
    using (ElementReader reader = new ElementReader())
    {
        for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next())
        {
            Page pg = itr.Current();
            reader.Begin(pg);
            Hashtable page_mcid_map = new Hashtable();
            mcid_doc_map.Add(pg.GetIndex(), page_mcid_map);
            ProcessElements2(reader, page_mcid_map);
            reader.End();
        }
    }

    STree tree = doc.GetStructTree();
    if (tree.IsValid())
    {
        int kidCount = tree.GetNumKids();
        for (int i = 0; i < kidCount; ++i)
        {
            ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0);
        }
    }

    Console.WriteLine();
    Console.WriteLine("Done 3.");
}
296 }
297}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales