Sample code for using the Apryse Server SDK to explore the logical structure and content of a tagged PDF file and dump that information to the console window. In tagged PDF files, the StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; the ContentItems are the leaf nodes of the structure tree. Learn more about our Server SDK and PDF Data Extraction SDK Capabilities.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6using System;
7using System.Collections;
8
9using pdftron;
10using pdftron.Common;
11using pdftron.Filters;
12using pdftron.SDF;
13using pdftron.PDF;
14using pdftron.PDF.Struct;
15
16
17namespace LogicalStructureTestCS
18{
19 //---------------------------------------------------------------------------------------
20 // This sample explores the structure and content of a tagged PDF document and dumps
21 // the structure information to the console window.
22 //
23 // In tagged PDF documents StructTree acts as a central repository for information
24 // related to a PDF document's logical structure. The tree consists of StructElement-s
25 // and ContentItem-s which are leaf nodes of the structure tree.
26 //
27 // The sample can be extended to access and extract the marked-content elements such
28 // as text and images.
29 //---------------------------------------------------------------------------------------
30 class Class1
31 {
/// <summary>
/// Starts a new console line and then writes one space for every indentation level.
/// </summary>
/// <param name="indent">Number of indentation levels to emit; values below one emit none.</param>
static void PrintIndent(int indent)
{
    Console.WriteLine();

    int remaining = indent;
    while (remaining-- > 0)
    {
        Console.Write(" ");
    }
}
33
34 // Used in code snippet 1.
/// <summary>
/// Recursively prints a structure element: its type, its title (when it has one),
/// and a short description of each content item found among its kids.
/// </summary>
/// <param name="element">Structure element to print; invalid elements are ignored.</param>
/// <param name="indent">Indentation level used for this element's own line.</param>
static void ProcessStructElement(SElement element, int indent)
{
    if (!element.IsValid())
    {
        return;
    }

    // The element itself is printed at 'indent'; everything below it is one level deeper.
    PrintIndent(indent);
    int childIndent = indent + 1;

    Console.Write("Type: " + element.GetType());
    if (element.HasTitle())
    {
        Console.Write(". Title: " + element.GetTitle());
    }

    int kidCount = element.GetNumKids();
    for (int k = 0; k < kidCount; ++k)
    {
        // Kids that are not content items are nested structure elements.
        if (!element.IsContentItem(k))
        {
            ProcessStructElement(element.GetAsStructElem(k), childIndent);
            continue;
        }

        // Leaf node: a content item.
        ContentItem item = element.GetAsContentItem(k);
        ContentItem.Type kind = item.GetType();
        Page owner = item.GetPage();

        PrintIndent(childIndent);
        Console.Write("Content Item. Part of page #" + owner.GetIndex());

        PrintIndent(childIndent);
        if (kind == ContentItem.Type.e_MCID || kind == ContentItem.Type.e_MCR)
        {
            Console.Write("MCID: " + item.GetMCID());
        }
        else if (kind == ContentItem.Type.e_OBJR)
        {
            Console.Write("OBJR ");
            Obj target = item.GetRefObj();
            if (target != null)
            {
                Console.Write("- Referenced Object#: " + target.GetObjNum());
            }
        }
    }
}
84
85 // Used in code snippet 2.
/// <summary>
/// Reads every element from <paramref name="reader"/> and, for path, text and form
/// XObject elements, prints the element kind followed by information about the
/// structure element it belongs to (when it has a valid one).
/// </summary>
/// <param name="reader">Element reader that has already been started on a page.</param>
static void ProcessElements(ElementReader reader)
{
    Element element;
    while ((element = reader.Next()) != null) // Read page contents
    {
        // In this sample we process only paths, text and form XObjects, but the
        // code can be extended to handle any element type.
        // (The original filter tested e_path twice, which made the e_form case
        // below unreachable.)
        Element.Type type = element.GetType();
        switch (type)
        {
            case Element.Type.e_path: // Process path ...
                Console.WriteLine();
                Console.Write("PATH: ");
                break;
            case Element.Type.e_text: // Process text ...
                Console.WriteLine();
                Console.WriteLine("TEXT: " + element.GetTextString());
                break;
            case Element.Type.e_form: // Process form XObjects
                Console.WriteLine();
                Console.Write("FORM XObject: ");
                //reader.FormBegin();
                //ProcessElements(reader);
                //reader.End();
                break;
            default:
                // Any other element type is skipped entirely.
                continue;
        }

        // Check if the element is associated with any structural element.
        // Content items are leaf nodes of the structure tree.
        SElement struct_parent = element.GetParentStructElement();
        if (struct_parent.IsValid())
        {
            // Print out the parent structural element's type, title, and object number.
            Console.Write(" Type: " + struct_parent.GetType()
                + ", MCID: " + element.GetStructMCID());
            if (struct_parent.HasTitle())
            {
                Console.Write(". Title: " + struct_parent.GetTitle());
            }
            Console.Write(", Obj#: " + struct_parent.GetSDFObj().GetObjNum());
        }
    }
}
129
130 // Used in code snippet 3.
131 //typedef map<int, string> MCIDPageMap;
132 //typedef map<int, MCIDPageMap> MCIDDocMap;
133
134 // Used in code snippet 3.
/// <summary>
/// Reads every element from <paramref name="reader"/> and accumulates the text of
/// text elements that carry a non-negative struct MCID into
/// <paramref name="mcid_page_map"/>, keyed by that MCID.
/// </summary>
/// <param name="reader">Element reader that has already been started on a page.</param>
/// <param name="mcid_page_map">Table from MCID (int) to the concatenated text (String).</param>
static void ProcessElements2(ElementReader reader, Hashtable mcid_page_map)
{
    for (Element current = reader.Next(); current != null; current = reader.Next())
    {
        // Only text is handled here; other element types could be added the same way.
        int id = current.GetStructMCID();
        if (id < 0 || current.GetType() != Element.Type.e_text)
        {
            continue;
        }

        String text = current.GetTextString();

        // Append to the text already stored for this MCID, or start a new entry.
        mcid_page_map[id] = mcid_page_map.ContainsKey(id)
            ? (String)(mcid_page_map[id]) + text
            : text;
    }
}
150
151 // Used in code snippet 3.
/// <summary>
/// Recursively prints a structure element as an XML-like tag. Between the opening
/// and closing tags it writes the text collected for each e_MCID content item and
/// the output of any nested structure elements.
/// </summary>
/// <param name="element">Structure element to print; invalid elements are ignored.</param>
/// <param name="mcid_doc_map">Table from page index to a per-page table of MCID to text.</param>
/// <param name="indent">Indentation level of the opening and closing tags.</param>
static void ProcessStructElement2(SElement element, Hashtable mcid_doc_map, int indent)
{
    if (!element.IsValid())
    {
        return;
    }

    // Opening tag, with the title as an attribute when there is one.
    PrintIndent(indent);
    Console.Write("<" + element.GetType());
    if (element.HasTitle())
    {
        Console.Write(" title=\"" + element.GetTitle() + "\"");
    }
    Console.Write(">");

    int kidCount = element.GetNumKids();
    for (int k = 0; k < kidCount; ++k)
    {
        // Nested structure elements are printed one level deeper.
        if (!element.IsContentItem(k))
        {
            ProcessStructElement2(element.GetAsStructElem(k), mcid_doc_map, indent + 1);
            continue;
        }

        ContentItem item = element.GetAsContentItem(k);
        if (item.GetType() != ContentItem.Type.e_MCID)
        {
            continue;
        }

        int pageNumber = item.GetPage().GetIndex();
        if (!mcid_doc_map.ContainsKey(pageNumber))
        {
            continue;
        }

        Hashtable pageTexts = (Hashtable)(mcid_doc_map[pageNumber]);
        int id = item.GetMCID();
        if (pageTexts.ContainsKey(id))
        {
            Console.Write(pageTexts[id]);
        }
    }

    // Closing tag.
    PrintIndent(indent);
    Console.Write("</" + element.GetType() + ">");
}
190
// Holds the loader object returned by PDFNetLoader.Instance().
private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();

// Explicit static constructor with an empty body.
static Class1()
{
}

/// <summary>
/// The main entry point for the application. Opens "tagged.pdf", runs the three
/// logical-structure samples against it, and saves the document to the output folder.
/// </summary>
static void Main(string[] args)
{
    PDFNet.Initialize(PDFTronLicense.Key);

    // Relative paths to the folders containing the test files and the output.
    string input_path = "../../../../TestFiles/";
    string output_path = "../../../../TestFiles/Output/";
    const string separator = "____________________________________________________________";

    try // Extract logical structure from a PDF document
    {
        using (PDFDoc doc = new PDFDoc(input_path + "tagged.pdf"))
        {
            doc.InitSecurityHandler();

            // Each flag switches one of the samples below on or off.
            bool example1 = true;
            bool example2 = true;
            bool example3 = true;

            if (example1)
            {
                Console.WriteLine(separator);
                Console.WriteLine("Sample 1 - Traverse logical structure tree...");

                STree structTree = doc.GetStructTree();
                if (!structTree.IsValid())
                {
                    Console.WriteLine("This document does not contain any logical structure.");
                }
                else
                {
                    Console.WriteLine("Document has a StructTree root.");

                    // Recursively get structure info for all child elements.
                    int kid = 0;
                    while (kid < structTree.GetNumKids())
                    {
                        ProcessStructElement(structTree.GetKid(kid), 0);
                        ++kid;
                    }
                }

                Console.WriteLine();
                Console.WriteLine("Done 1.");
            }

            if (example2)
            {
                Console.WriteLine(separator);
                Console.WriteLine("Sample 2 - Get parent logical structure elements from");
                Console.WriteLine("layout elements.");

                ElementReader pageReader = new ElementReader();
                PageIterator pages = doc.GetPageIterator();
                while (pages.HasNext())
                {
                    pageReader.Begin(pages.Current());
                    ProcessElements(pageReader);
                    pageReader.End();
                    pages.Next();
                }

                Console.WriteLine();
                Console.WriteLine("Done 2.");
            }

            if (example3)
            {
                Console.WriteLine(separator);
                Console.WriteLine("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");

                // Maps a page number (int) to a per-page table, which in turn maps
                // a struct MCID (int) to the text collected for it (String).
                Hashtable mcid_doc_map = new Hashtable();

                ElementReader pageReader = new ElementReader();
                PageIterator pages = doc.GetPageIterator();
                while (pages.HasNext())
                {
                    Page pg = pages.Current();
                    pageReader.Begin(pg);

                    Hashtable pageTexts = new Hashtable();
                    mcid_doc_map.Add(pg.GetIndex(), pageTexts);
                    ProcessElements2(pageReader, pageTexts);

                    pageReader.End();
                    pages.Next();
                }

                STree structTree = doc.GetStructTree();
                if (structTree.IsValid())
                {
                    int kid = 0;
                    while (kid < structTree.GetNumKids())
                    {
                        ProcessStructElement2(structTree.GetKid(kid), mcid_doc_map, 0);
                        ++kid;
                    }
                }

                Console.WriteLine();
                Console.WriteLine("Done 3.");
            }

            doc.Save(output_path + "LogicalStructure.pdf", 0);
        }
    }
    catch (PDFNetException e)
    {
        Console.WriteLine(e.Message);
    }

    PDFNet.Terminate();
}
296 }
297}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/PDFNet.h>
7#include <PDF/PDFDoc.h>
8#include <PDF/ElementReader.h>
9#include <iostream>
10#include <map>
11#include "../../LicenseKey/CPP/LicenseKey.h"
12
13using namespace pdftron;
14using namespace PDF;
15using namespace std;
16
17//---------------------------------------------------------------------------------------
18// This sample explores the structure and content of a tagged PDF document and dumps
19// the structure information to the console window.
20//
21// In tagged PDF documents StructTree acts as a central repository for information
22// related to a PDF document's logical structure. The tree consists of StructElement-s
23// and ContentItem-s which are leaf nodes of the structure tree.
24//
25// The sample can be extended to access and extract the marked-content elements such
26// as text and images.
27//---------------------------------------------------------------------------------------
28
29
30void PrintIndent(int indent) { cout << '\n'; for (int i=0; i<indent; ++i) cout << " "; }
31
32// Used in code snippet 1.
33void ProcessStructElement(Struct::SElement element, int ident)
34{
35 if (!element.IsValid()) {
36 return;
37 }
38
39 // Print out the type and title info, if any.
40 PrintIndent(ident++);
41 cout << "Type: "<< element.GetType();
42 if (element.HasTitle()) {
43 cout << ". Title: "<< element.GetTitle();
44 }
45
46 int num = element.GetNumKids();
47 for (int i=0; i<num; ++i)
48 {
49 // Check is the kid is a leaf node (i.e. it is a ContentItem).
50 if (element.IsContentItem(i)) {
51 Struct::ContentItem cont = element.GetAsContentItem(i);
52 Struct::ContentItem::Type type = cont.GetType();
53
54 Page page = cont.GetPage();
55
56 PrintIndent(ident);
57 cout << "Content Item. Part of page #" << page.GetIndex();
58
59 PrintIndent(ident);
60 switch (type) {
61 case Struct::ContentItem::e_MCID:
62 case Struct::ContentItem::e_MCR:
63 cout << "MCID: " << cont.GetMCID();
64 break;
65 case Struct::ContentItem::e_OBJR:
66 {
67 cout << "OBJR ";
68 if (SDF::Obj ref_obj = cont.GetRefObj())
69 cout << "- Referenced Object#: " << ref_obj.GetObjNum();
70 }
71 break;
72 default:
73 break;
74 }
75 }
76 else { // the kid is another StructElement node.
77 ProcessStructElement(element.GetAsStructElem(i), ident);
78 }
79 }
80}
81
82// Used in code snippet 2.
83void ProcessElements(ElementReader& reader)
84{
85 Element element;
86 while (element = reader.Next()) // Read page contents
87 {
88 // In this sample we process only paths & text, but the code can be
89 // extended to handle any element type.
90 Element::Type type = element.GetType();
91 if (type == Element::e_path || type == Element::e_text || type == Element::e_path)
92 {
93 switch (type) {
94 case Element::e_path: // Process path ...
95 cout << "\nPATH: ";
96 break;
97 case Element::e_text: // Process text ...
98 cout << "\nTEXT: " << element.GetTextString() << endl;
99 break;
100 case Element::e_form: // Process form XObjects
101 cout << "\nFORM XObject: ";
102 //reader.FormBegin();
103 //ProcessElements(reader);
104 //reader.End();
105 break;
106 }
107
108 // Check if the element is associated with any structural element.
109 // Content items are leaf nodes of the structure tree.
110 Struct::SElement struct_parent = element.GetParentStructElement();
111 if (struct_parent.IsValid()) {
112 // Print out the parent structural element's type, title, and object number.
113 cout << " Type: " << struct_parent.GetType()
114 << ", MCID: " << element.GetStructMCID();
115 if (struct_parent.HasTitle()) {
116 cout << ". Title: "<< struct_parent.GetTitle();
117 }
118 cout << ", Obj#: " << struct_parent.GetSDFObj().GetObjNum();
119 }
120 }
121 }
122}
123
124// Used in code snippet 3.
125typedef map<int, string> MCIDPageMap;
126typedef map<int, MCIDPageMap> MCIDDocMap;
127
128// Used in code snippet 3.
129void ProcessElements2(ElementReader& reader, MCIDPageMap& mcid_page_map)
130{
131 Element element;
132 while (element = reader.Next()) // Read page contents
133 {
134 // In this sample we process only text, but the code can be extended
135 // to handle paths, images, or any other Element type.
136 int mcid = element.GetStructMCID();
137 if (mcid>= 0 && element.GetType() == Element::e_text) {
138 string val = element.GetTextString().ConvertToAscii();
139 MCIDPageMap::iterator itr = mcid_page_map.find(mcid);
140 if (itr != mcid_page_map.end()) itr->second += val;
141 else mcid_page_map.insert(MCIDPageMap::value_type(mcid, val));
142 }
143 }
144}
145
146// Used in code snippet 3.
147void ProcessStructElement2(Struct::SElement element, MCIDDocMap& mcid_doc_map, int ident)
148{
149 if (!element.IsValid()) {
150 return;
151 }
152
153 // Print out the type and title info, if any.
154 PrintIndent(ident);
155 cout << "<" << element.GetType();
156 if (element.HasTitle()) {
157 cout << " title=\""<< element.GetTitle() << "\"";
158 }
159 cout << ">";
160
161 int num = element.GetNumKids();
162 for (int i=0; i<num; ++i)
163 {
164 if (element.IsContentItem(i)) {
165 Struct::ContentItem cont = element.GetAsContentItem(i);
166 if (cont.GetType() == Struct::ContentItem::e_MCID) {
167 int page_num = cont.GetPage().GetIndex();
168 MCIDDocMap::iterator itr = mcid_doc_map.find(page_num);
169 if (itr!=mcid_doc_map.end()) {
170 MCIDPageMap& mcid_page_map = itr->second;
171 MCIDPageMap::iterator itr2 = mcid_page_map.find(cont.GetMCID());
172 if (itr2 != mcid_page_map.end()) {
173 cout << itr2->second;
174 }
175 }
176 }
177 }
178 else { // the kid is another StructElement node.
179 ProcessStructElement2(element.GetAsStructElem(i), mcid_doc_map, ident+1);
180 }
181 }
182
183 PrintIndent(ident);
184 cout << "</" << element.GetType() << ">";
185}
186
187
188int main(int argc, char *argv[])
189{
190 int ret = 0;
191 PDFNet::Initialize(LicenseKey);
192
193 // Relative path to the folder containing test files.
194 string input_path = "../../TestFiles/";
195 string output_path = "../../TestFiles/Output/";
196
197 try // Extract logical structure from a PDF document
198 {
199 PDFDoc doc((input_path + "tagged.pdf").c_str());
200 doc.InitSecurityHandler();
201
202 cout << "____________________________________________________________" << endl;
203 cout << "Sample 1 - Traverse logical structure tree..." << endl;
204 {
205 Struct::STree tree = doc.GetStructTree();
206 if (tree.IsValid()) {
207 cout << "Document has a StructTree root." << endl;
208
209 for (int i=0; i<tree.GetNumKids(); ++i) {
210 // Recursively get structure info for all child elements.
211 ProcessStructElement(tree.GetKid(i), 0);
212 }
213 }
214 else {
215 cout << "This document does not contain any logical structure." << endl;
216 }
217 }
218 cout << "\nDone 1." << endl;
219
220 cout << "____________________________________________________________" << endl;
221 cout << "Sample 2 - Get parent logical structure elements from" << endl;
222 cout << "layout elements." << endl;
223 {
224 ElementReader reader;
225 for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) {
226 reader.Begin(itr.Current());
227 ProcessElements(reader);
228 reader.End();
229 }
230 }
231 cout << "\nDone 2." << endl;
232
233 cout << "____________________________________________________________" << endl;
234 cout << "Sample 3 - 'XML style' extraction of PDF logical structure and page content." << endl;
235 {
236 MCIDDocMap mcid_doc_map;
237 ElementReader reader;
238 for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) {
239 reader.Begin(itr.Current());
240 pair<MCIDDocMap::iterator, bool> r = mcid_doc_map.insert(MCIDDocMap::value_type(itr.Current().GetIndex(), MCIDPageMap()));
241 MCIDPageMap& page_mcid_map = (r.first)->second;
242 ProcessElements2(reader, page_mcid_map);
243 reader.End();
244 }
245
246 Struct::STree tree = doc.GetStructTree();
247 if (tree.IsValid()) {
248 for (int i=0; i<tree.GetNumKids(); ++i) {
249 ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0);
250 }
251 }
252 }
253 cout << "\nDone 3." << endl;
254
255 doc.Save(output_path + "LogicalStructure.pdf", 0);
256 }
257 catch(Common::Exception& e)
258 {
259 cout << e << endl;
260 ret = 1;
261 }
262 catch(...)
263 {
264 cout << "Unknown Exception" << endl;
265 ret = 1;
266 }
267
268 PDFNet::Terminate();
269 return ret;
270}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8 "fmt"
9 "strconv"
10 "os"
11 . "pdftron"
12)
13
14import "pdftron/Samples/LicenseKey/GO"
15
16//---------------------------------------------------------------------------------------
17// This sample explores the structure and content of a tagged PDF document and dumps
18// the structure information to the console window.
19//
20// In tagged PDF documents StructTree acts as a central repository for information
21// related to a PDF document's logical structure. The tree consists of StructElement-s
22// and ContentItem-s which are leaf nodes of the structure tree.
23//
24// The sample can be extended to access and extract the marked-content elements such
25// as text and images.
26//---------------------------------------------------------------------------------------
27
// PrintIndent starts a new line on standard output and then writes one space
// per indentation level.
func PrintIndent(indent int) {
	os.Stdout.Write([]byte("\n"))
	for level := 0; level < indent; level++ {
		os.Stdout.Write([]byte(" "))
	}
}
36
37func ProcessStructElement(element SElement, indent int){
38 if !element.IsValid(){
39 return
40 }
41
42 // Print out the type and title info, if any.
43 PrintIndent(indent)
44 indent = indent + 1
45 os.Stdout.Write([]byte("Type: " + element.GetType()))
46 if element.HasTitle(){
47 os.Stdout.Write([]byte(". Title:" + element.GetTitle()))
48 }
49 num := element.GetNumKids()
50 i := 0
51 for i < num{
52 // Check if the kid is a leaf node (i.e. it is a ContentItem)
53 if element.IsContentItem(i){
54 cont := element.GetAsContentItem(i)
55 etype := cont.GetType()
56
57 page := cont.GetPage()
58
59 PrintIndent(indent)
60 os.Stdout.Write([]byte("Content Item. Part of page //" + strconv.Itoa(page.GetIndex())))
61 PrintIndent(indent)
62 if etype == ContentItemE_MCID{
63 os.Stdout.Write([]byte("MCID: " + strconv.Itoa(cont.GetMCID())))
64 }else if etype == ContentItemE_MCR{
65 os.Stdout.Write([]byte("MCID: " + strconv.Itoa(cont.GetMCID())))
66 }else if etype == ContentItemE_OBJR{
67 os.Stdout.Write([]byte("OBJR "))
68 refObj := cont.GetRefObj()
69 if refObj != nil{
70 os.Stdout.Write([]byte("- Referenced Object//: " + strconv.Itoa(int(refObj.GetObjNum()))))
71 }
72 }
73 }else{
74 ProcessStructElement(element.GetAsStructElem(i), indent)
75 }
76 i = i + 1
77 }
78}
79
80// Used in code snippet 3.
81func ProcessElements2(reader ElementReader, mcidPageMap map[int]string){
82 element := reader.Next()
83 for element.GetMp_elem().Swigcptr() != 0{ // Read page contents
84 // In this sample we process only text, but the code can be extended
85 // to handle paths, images, or other Element type.
86 mcid := element.GetStructMCID()
87
88 if mcid >= 0 && element.GetType() == ElementE_text{
89 val := element.GetTextString()
90 if _, ok := mcidPageMap[mcid]; ok {
91 mcidPageMap[mcid] = mcidPageMap[mcid] + val
92 }else{
93 mcidPageMap[mcid] = val
94 }
95 }
96 element = reader.Next()
97 }
98}
99
100// Used in code snippet 2.
101func ProcessElements(reader ElementReader){
102 element := reader.Next()
103 for element.GetMp_elem().Swigcptr() != 0{ // Read page contents
104 // In this sample we process only paths & text, but the code can be
105 // extended to handle any element type.
106 etype := element.GetType()
107 if (etype == ElementE_path ||
108 etype == ElementE_text ||
109 etype == ElementE_path){
110 if etype == ElementE_path{ // Process path ...
111 os.Stdout.Write([]byte("\nPATH: "))
112 }else if etype == ElementE_text{ // Process text ...
113 os.Stdout.Write([]byte("\nTEXT: " + element.GetTextString() + "\n"))
114 }else if etype == ElementE_path{ // Process from XObjects
115 os.Stdout.Write([]byte("\nFORM XObject: "))
116 }
117
118 // Check if the element is associated with any structural element.
119 // Content items are leaf nodes of the structure tree.
120 structParent := element.GetParentStructElement()
121 if structParent.IsValid(){
122 // Print out the parent structural element's type, title, and object number.
123 os.Stdout.Write([]byte(" Type: " + structParent.GetType() + ", MCID: " + strconv.Itoa(element.GetStructMCID())))
124 if structParent.HasTitle(){
125 os.Stdout.Write([]byte(". Title: " + structParent.GetTitle()))
126 }
127 os.Stdout.Write([]byte(", Obj//: " + strconv.Itoa(int(structParent.GetSDFObj().GetObjNum()))))
128 }
129 }
130 element = reader.Next()
131 }
132}
133
134func ProcessStructElement2(element SElement, mcidDocMap map[int](map[int]string), indent int){
135 if !element.IsValid(){
136 return
137 }
138 // Print out the type and title info, if any
139 PrintIndent(indent)
140 os.Stdout.Write([]byte("<" + element.GetType()))
141 if element.HasTitle(){
142 os.Stdout.Write([]byte(" title=\"" + element.GetTitle() + "\""))
143 }
144 os.Stdout.Write([]byte(">"))
145
146 num := element.GetNumKids()
147 i := 0
148 for i < num{
149 if element.IsContentItem(i){
150 cont := element.GetAsContentItem(i)
151 if cont.GetType() == ContentItemE_MCID{
152 pageNum := cont.GetPage().GetIndex()
153 if _, ok := mcidDocMap[pageNum]; ok{
154 mcidPageMap := mcidDocMap[pageNum]
155 mcidKey := cont.GetMCID()
156 if _, ok := mcidPageMap[mcidKey]; ok{
157 os.Stdout.Write([]byte(mcidPageMap[mcidKey]))
158 }
159 }
160 }
161 }else{ // the kid is another StructElement node.
162 ProcessStructElement2(element.GetAsStructElem(i), mcidDocMap, indent+1)
163 }
164 i = i + 1
165 }
166 PrintIndent(indent)
167 os.Stdout.Write([]byte("</" + element.GetType() + ">"))
168}
169
170func main(){
171 PDFNetInitialize(PDFTronLicense.Key)
172
173 // Relative path to the folder containing the test files.
174 inputPath := "../../TestFiles/"
175 outputPath := "../../TestFiles/Output/"
176
177 // Extract logical structure from a PDF document
178 doc := NewPDFDoc(inputPath + "tagged.pdf")
179 doc.InitSecurityHandler()
180
181 fmt.Println("____________________________________________________________")
182 fmt.Println("Sample 1 - Traverse logical structure tree...")
183
184 tree := doc.GetStructTree()
185 if tree.IsValid(){
186 fmt.Println("Document has a StructTree root.")
187
188 i := 0
189 for i < tree.GetNumKids(){
190 // Recursively get structure info for all child elements.
191 ProcessStructElement(tree.GetKid(i), 0)
192 i = i + 1
193 }
194 }else{
195 fmt.Println("This document does not contain any logical structure.")
196 }
197
198 fmt.Println("\nDone 1.")
199
200 fmt.Println("____________________________________________________________")
201 fmt.Println("Sample 2 - Get parent logical structure elements from")
202 fmt.Println("layout elements.")
203
204 reader := NewElementReader()
205 itr := doc.GetPageIterator()
206 for itr.HasNext(){
207 reader.Begin(itr.Current())
208 ProcessElements(reader)
209 reader.End()
210 itr.Next()
211 }
212
213 fmt.Println("\nDone 2.")
214
215 fmt.Println("____________________________________________________________")
216 fmt.Println("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
217 // A map which maps page numbers(as Integers)
218 // to page Maps(which map from struct mcid(as Integers) to
219 // text Strings)
220 var mcidDocMap = make(map[int](map[int]string))
221 reader = NewElementReader()
222 itr = doc.GetPageIterator()
223 for itr.HasNext(){
224 reader.Begin(itr.Current())
225 var pageMcidMap = make(map[int]string)
226 mcidDocMap[itr.Current().GetIndex()] = pageMcidMap
227 ProcessElements2(reader, pageMcidMap)
228 reader.End()
229 itr.Next()
230 }
231 tree = doc.GetStructTree()
232 if tree.IsValid(){
233 i := 0
234 for i < tree.GetNumKids(){
235 ProcessStructElement2(tree.GetKid(i), mcidDocMap, 0)
236 i = i + 1
237 }
238 }
239 fmt.Println("\nDone 3.")
240 doc.Save(outputPath + "LogicalStructure.pdf", uint(SDFDocE_linearized))
241 doc.Close()
242 PDFNetTerminate()
243}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import java.util.Map;
7import java.util.TreeMap;
8
9import com.pdftron.common.PDFNetException;
10import com.pdftron.pdf.struct.*;
11import com.pdftron.pdf.*;
12import com.pdftron.sdf.*;
13
14//---------------------------------------------------------------------------------------
15// This sample explores the structure and content of a tagged PDF document and dumps
16// the structure information to the console window.
17//
18// In tagged PDF documents StructTree acts as a central repository for information
19// related to a PDF document's logical structure. The tree consists of StructElement-s
20// and ContentItem-s which are leaf nodes of the structure tree.
21//
22// The sample can be extended to access and extract the marked-content elements such
23// as text and images.
24//---------------------------------------------------------------------------------------
25public class LogicalStructureTest {
26 static void PrintIndent(int indent) {
27 System.out.println();
28 for (int i = 0; i < indent; ++i) System.out.print(" ");
29 }
30
31 // Used in code snippet 1.
32 static void ProcessStructElement(SElement element, int indent) throws PDFNetException {
33 if (!element.isValid()) {
34 return;
35 }
36
37 // Print out the type and title info, if any.
38 PrintIndent(indent++);
39 System.out.print("Type: " + element.getType());
40 if (element.hasTitle()) {
41 System.out.print(". Title: " + element.getTitle());
42 }
43
44 int num = element.getNumKids();
45 for (int i = 0; i < num; ++i) {
46 // Check is the kid is a leaf node (i.e. it is a ContentItem).
47 if (element.isContentItem(i)) {
48 ContentItem cont = element.getAsContentItem(i);
49 int type = cont.getType();
50
51 Page page = cont.getPage();
52
53 PrintIndent(indent);
54 System.out.print("Content Item. Part of page #" + page.getIndex());
55
56 PrintIndent(indent);
57 switch (type) {
58 case ContentItem.e_MCID:
59 case ContentItem.e_MCR:
60 System.out.print("MCID: " + cont.getMCID());
61 break;
62 case ContentItem.e_OBJR: {
63 System.out.print("OBJR ");
64 Obj ref_obj = cont.getRefObj();
65 if (ref_obj != null)
66 System.out.print("- Referenced Object#: " + ref_obj.getObjNum());
67 }
68 break;
69 default:
70 break;
71 }
72 } else { // the kid is another StructElement node.
73 ProcessStructElement(element.getAsStructElem(i), indent);
74 }
75 }
76 }
77
78 // Used in code snippet 2.
79 static void ProcessElements(ElementReader reader) throws PDFNetException {
80 Element element;
81 while ((element = reader.next()) != null) // Read page contents
82 {
83 // In this sample we process only paths & text, but the code can be
84 // extended to handle any element type.
85 int type = element.getType();
86 if (type == Element.e_path || type == Element.e_text || type == Element.e_path) {
87 switch (type) {
88 case Element.e_path: // Process path ...
89 System.out.print("\nPATH: ");
90 break;
91 case Element.e_text: // Process text ...
92 System.out.print("\nTEXT: " + element.getTextString() + "\n");
93 break;
94 case Element.e_form: // Process form XObjects
95 System.out.print("\nFORM XObject: ");
96 //reader.FormBegin();
97 //ProcessElements(reader);
98 //reader.End();
99 break;
100 }
101
102 // Check if the element is associated with any structural element.
103 // Content items are leaf nodes of the structure tree.
104 SElement struct_parent = element.getParentStructElement();
105 if (struct_parent.isValid()) {
106 // Print out the parent structural element's type, title, and object number.
107 System.out.print(" Type: " + struct_parent.getType()
108 + ", MCID: " + element.getStructMCID());
109 if (struct_parent.hasTitle()) {
110 System.out.print(". Title: " + struct_parent.getTitle());
111 }
112 System.out.print(", Obj#: " + struct_parent.getSDFObj().getObjNum());
113 }
114 }
115 }
116 }
117
118 // Used in code snippet 3.
119 //typedef map<int, string> MCIDPageMap;
120 //typedef map<int, MCIDPageMap> MCIDDocMap;
121
122 // Used in code snippet 3.
123 static void ProcessElements2(ElementReader reader, Map<Integer, String> mcid_page_map) throws PDFNetException {
124 Element element;
125 while ((element = reader.next()) != null) // Read page contents
126 {
127 // In this sample we process only text, but the code can be extended
128 // to handle paths, images, or any other Element type.
129 int mcid = element.getStructMCID();
130 Integer key_mcid = new Integer(mcid);
131 if (mcid >= 0 && element.getType() == Element.e_text) {
132 String val = element.getTextString();
133 if (mcid_page_map.containsKey(key_mcid))
134 mcid_page_map.put(key_mcid, ((String) (mcid_page_map.get(key_mcid)) + val));
135 else mcid_page_map.put(key_mcid, val);
136 }
137 }
138 }
139
140 // Used in code snippet 3.
141 static void ProcessStructElement2(SElement element, Map<Integer, Map<Integer, String>> mcid_doc_map, int indent) throws PDFNetException {
142 if (!element.isValid()) {
143 return;
144 }
145
146 // Print out the type and title info, if any.
147 PrintIndent(indent);
148 System.out.print("<" + element.getType());
149 if (element.hasTitle()) {
150 System.out.print(" title=\"" + element.getTitle() + "\"");
151 }
152 System.out.print(">");
153
154 int num = element.getNumKids();
155 for (int i = 0; i < num; ++i) {
156 if (element.isContentItem(i)) {
157 ContentItem cont = element.getAsContentItem(i);
158 if (cont.getType() == ContentItem.e_MCID) {
159 int page_num = cont.getPage().getIndex();
160 Integer page_num_key = new Integer(page_num);
161 if (mcid_doc_map.containsKey(page_num_key)) {
162 Map<Integer, String> mcid_page_map = mcid_doc_map.get(page_num_key);
163 Integer mcid_key = new Integer(cont.getMCID());
164 if (mcid_page_map.containsKey(mcid_key)) {
165 System.out.print(mcid_page_map.get(mcid_key));
166 }
167 }
168 }
169 } else { // the kid is another StructElement node.
170 ProcessStructElement2(element.getAsStructElem(i), mcid_doc_map, indent + 1);
171 }
172 }
173
174 PrintIndent(indent);
175 System.out.print("</" + element.getType() + ">");
176 }
177
178
179 /**
180 * @param args
181 */
182 public static void main(String[] args) {
183 PDFNet.initialize(PDFTronLicense.Key());
184
185 // Relative path to the folder containing test files.
186 String input_path = "../../TestFiles/";
187 String output_path = "../../TestFiles/Output/";
188
189 try (PDFDoc doc = new PDFDoc((input_path + "tagged.pdf"))) // Extract logical structure from a PDF document
190 {
191 doc.initSecurityHandler();
192
193 System.out.println("____________________________________________________________");
194 System.out.println("Sample 1 - Traverse logical structure tree...");
195 {
196 STree tree = doc.getStructTree();
197 if (tree.isValid()) {
198 System.out.println("Document has a StructTree root.");
199
200 for (int i = 0; i < tree.getNumKids(); ++i) {
201 // Recursively get structure info for all all child elements.
202 ProcessStructElement(tree.getKid(i), 0);
203 }
204 } else {
205 System.out.println("This document does not contain any logical structure.");
206 }
207 }
208 System.out.println("\nDone 1.");
209
210 System.out.println("____________________________________________________________");
211 System.out.println("Sample 2 - Get parent logical structure elements from");
212 System.out.println("layout elements.");
213 {
214 ElementReader reader = new ElementReader();
215 for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
216 reader.begin(itr.next());
217 ProcessElements(reader);
218 reader.end();
219 }
220 }
221 System.out.println("\nDone 2.");
222
223 System.out.println("____________________________________________________________");
224 System.out.println("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
225 {
226 //A map which maps page numbers(as Integers)
227 //to page Maps(which map from struct mcid(as Integers) to
228 //text Strings)
229 Map<Integer, Map<Integer, String>> mcid_doc_map = new TreeMap<Integer, Map<Integer, String>>();
230 ElementReader reader = new ElementReader();
231 for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
232 Page current = itr.next();
233 reader.begin(current);
234 Map<Integer, String> page_mcid_map = new TreeMap<Integer, String>();
235 mcid_doc_map.put(new Integer(current.getIndex()), page_mcid_map);
236 ProcessElements2(reader, page_mcid_map);
237 reader.end();
238 }
239
240 STree tree = doc.getStructTree();
241 if (tree.isValid()) {
242 for (int i = 0; i < tree.getNumKids(); ++i) {
243 ProcessStructElement2(tree.getKid(i), mcid_doc_map, 0);
244 }
245 }
246 }
247 System.out.println("\nDone 3.");
248 doc.save((output_path + "LogicalStructure.pdf"), SDFDoc.SaveMode.LINEARIZED, null);
249 } catch (Exception e) {
250 e.printStackTrace();
251 }
252
253 PDFNet.terminate();
254 }
255
256}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6//---------------------------------------------------------------------------------------
7// This sample explores the structure and content of a tagged PDF document and dumps
8// the structure information to the console window.
9//
10// In tagged PDF documents StructTree acts as a central repository for information
11// related to a PDF document's logical structure. The tree consists of StructElement-s
12// and ContentItem-s which are leaf nodes of the structure tree.
13//
14// The sample can be extended to access and extract the marked-content elements such
15// as text and images.
16//---------------------------------------------------------------------------------------
17
18
19const { PDFNet } = require('@pdftron/pdfnet-node');
20const PDFTronLicense = require('../LicenseKey/LicenseKey');
21
22((exports) => {
23
24 exports.runLogicalStructureTest = () => {
25
/**
 * Flushes the pending output line to the console, then starts the next line
 * with `indent` leading spaces.
 *
 * @param {{str: string}} printState - holder for the line being assembled
 * @param {number} indent - number of leading spaces for the next line
 */
const printAndIndent = (printState, indent) => {
  console.log(printState.str);

  // Grow the padding one space at a time until it reaches the requested width.
  let padding = '';
  while (padding.length < indent) {
    padding += ' ';
  }
  printState.str = padding;
};
35
36 // Used in code snippet 1.
/**
 * Recursively prints a structure element's type and title, followed by one
 * entry per kid: content items are described in place, child structure
 * elements are printed one indent level deeper.
 *
 * @param element - structure element to print; invalid elements are skipped
 * @param {number} indent - indent level of this element's header line
 * @param {{str: string}} printState - holder for the line being assembled
 */
const processStructElement = async (element, indent, printState) => {
  if (!(await element.isValid())) {
    return;
  }

  // Print out the type and title info, if any.
  printAndIndent(printState, indent++);
  printState.str += 'Type: ' + (await element.getType());
  if (await element.hasTitle()) {
    printState.str += '. Title: ' + (await element.getTitle());
  }

  const num = await element.getNumKids();
  for (let i = 0; i < num; ++i) {
    // Check if the kid is a leaf node (i.e. it is a ContentItem).
    if (await element.isContentItem(i)) {
      const cont = await element.getAsContentItem(i);
      const type = await cont.getType();

      const page = await cont.getPage();

      printAndIndent(printState, indent);
      printState.str += 'Content Item. Part of page #' + (await page.getIndex());

      printAndIndent(printState, indent);
      switch (type) {
        case PDFNet.ContentItem.Type.e_MCID:
        case PDFNet.ContentItem.Type.e_MCR:
          printState.str += 'MCID: ' + (await cont.getMCID());
          break;
        case PDFNet.ContentItem.Type.e_OBJR:
          {
            printState.str += 'OBJR ';
            const refObj = await cont.getRefObj();
            if (refObj) {
              // getObjNum() is asynchronous like the other PDFNet calls here;
              // without the await a Promise would be concatenated into the output.
              printState.str += '- Referenced Object#: ' + (await refObj.getObjNum());
            }
          }
          break;
        default:
          break;
      }
    } else { // the kid is another StructElement node.
      await processStructElement(await element.getAsStructElem(i), indent, printState);
    }
  }
};
85
86 // Used in code snippet 2.
/**
 * Reads every element from `reader` and, for paths, text and form XObjects,
 * appends a line describing the element and, when it has a valid parent
 * structure element, that parent's type, MCID, title and object number.
 *
 * @param reader - element reader to drain
 * @param {{str: string}} printState - holder for the output being assembled
 */
const processElements = async (reader, printState) => {
  let element;
  while ((element = await reader.next())) { // Read page contents
    // In this sample we process only paths, text and form XObjects, but the
    // code can be extended to handle any element type.
    const type = await element.getType();
    // The third test previously repeated e_path, which made the e_form case
    // below unreachable.
    if (type === PDFNet.Element.Type.e_path || type === PDFNet.Element.Type.e_text || type === PDFNet.Element.Type.e_form) {
      switch (type) {
        case PDFNet.Element.Type.e_path: // Process path ...
          printState.str += '\nPATH: ';
          break;
        case PDFNet.Element.Type.e_text: // Process text ...
          printState.str += '\nTEXT: ' + (await element.getTextString()) + '\n';
          break;
        case PDFNet.Element.Type.e_form: // Process form XObjects
          printState.str += '\nFORM XObject: ';
          // reader.formBegin();
          // await ProcessElements(reader);
          // reader.end();
          break;
      }

      // Check if the element is associated with any structural element.
      // Content items are leaf nodes of the structure tree.
      const structParent = await element.getParentStructElement();
      if (await structParent.isValid()) {
        // Print out the parent structural element's type, title, and object number.
        printState.str += ' Type: ' + (await structParent.getType()) + ', MCID: ' + (await element.getStructMCID());
        if (await structParent.hasTitle()) {
          printState.str += '. Title: ' + (await structParent.getTitle());
        }
        printState.str += ', Obj#: ' + (await (await structParent.getSDFObj()).getObjNum());
      }
    }
  }
};
123
124 // Used in code snippet 3.
/**
 * Drains `reader` and accumulates, per struct MCID, the text of every text
 * element that carries a non-negative MCID.
 *
 * @param reader - element reader to drain
 * @param {Object} mcidPageMap - MCID-to-text map for the page; updated in place
 */
const processElements2 = async (reader, mcidPageMap) => {
  // Read page contents
  for (let current = await reader.next(); current; current = await reader.next()) {
    // Only text is handled here; paths, images, or any other Element type
    // could be processed in the same loop.
    const mcid = await current.getStructMCID();
    if (mcid < 0) {
      continue;
    }
    if ((await current.getType()) !== PDFNet.Element.Type.e_text) {
      continue;
    }
    const text = await current.getTextString();
    // Append to the text already stored for this MCID, or start a new entry.
    mcidPageMap[mcid] = (mcid in mcidPageMap) ? mcidPageMap[mcid] + text : text;
  }
};
141
142 // Used in code snippet 3.
/**
 * Recursively emits the structure element as an XML-like tag, with the page
 * text collected for its MCID content items between the opening and closing
 * tags. Child structure elements are emitted one indent level deeper.
 *
 * @param element - structure element to emit; invalid elements are skipped
 * @param {Object} mcidDocMap - page index -> (MCID -> text) lookup
 * @param {number} indent - indent level for this element's tags
 * @param {{str: string}} printState - holder for the line being assembled
 */
const processStructElement2 = async (element, mcidDocMap, indent, printState) => {
  const usable = await element.isValid();
  if (!usable) {
    return;
  }

  // Opening tag: type plus optional title attribute.
  printAndIndent(printState, indent);
  const openingType = await element.getType();
  printState.str += '<' + openingType;
  if (await element.hasTitle()) {
    const title = await element.getTitle();
    printState.str += ' title="' + title + '"';
  }
  printState.str += '>';

  const kidCount = await element.getNumKids();
  for (let kid = 0; kid < kidCount; ++kid) {
    if (!(await element.isContentItem(kid))) {
      // the kid is another StructElement node.
      const child = await element.getAsStructElem(kid);
      await processStructElement2(child, mcidDocMap, indent + 1, printState);
      continue;
    }

    const item = await element.getAsContentItem(kid);
    if ((await item.getType()) !== PDFNet.ContentItem.Type.e_MCID) {
      continue;
    }
    const itemPage = await item.getPage();
    const pageTexts = mcidDocMap[await itemPage.getIndex()];
    if (!pageTexts) {
      continue;
    }
    const mcid = await item.getMCID();
    if (mcid in pageTexts) {
      printState.str += pageTexts[mcid];
    }
  }

  // Closing tag.
  printAndIndent(printState, indent);
  const closingType = await element.getType();
  printState.str += '</' + closingType + '>';
};
178
/**
 * Runs the three logical-structure samples against tagged.pdf: (1) traverses
 * the structure tree, (2) looks up the parent structure element of each
 * layout element, and (3) prints an XML-style dump combining the structure
 * tree with the page text. Finally saves the document to the Output folder.
 * Any error is caught and logged to the console.
 */
const main = async () => {
  // Relative path to the folder containing test files.
  const inputPath = '../TestFiles/';
  const printState = { str: '' };
  try { // Extract logical structure from a PDF document
    const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'tagged.pdf');
    // Every PDFNet call below is awaited so that a rejection reaches the
    // catch block instead of surfacing as an unhandled promise rejection.
    await doc.initSecurityHandler();

    let reader = null;
    let tree = null;

    console.log('____________________________________________________________');
    console.log('Sample 1 - Traverse logical structure tree...');
    tree = await doc.getStructTree();
    if (await tree.isValid()) {
      console.log('Document has a StructTree root.');
      for (let i = 0, numKids = await tree.getNumKids(); i < numKids; ++i) {
        // Recursively get structure info for all child elements.
        await processStructElement(await tree.getKid(i), 0, printState);
      }
    } else {
      console.log('This document does not contain any logical structure.');
    }
    printAndIndent(printState, 0);
    console.log('Done 1.');

    console.log('____________________________________________________________');
    console.log('Sample 2 - Get parent logical structure elements from');
    console.log('layout elements.');
    reader = await PDFNet.ElementReader.create();
    for (let itr = await doc.getPageIterator(); await itr.hasNext(); await itr.next()) {
      await reader.beginOnPage(await itr.current());
      await processElements(reader, printState);
      await reader.end();
    }
    printAndIndent(printState, 0);
    console.log('Done 2.');

    console.log('____________________________________________________________');
    console.log("Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
    {
      // Maps page index -> (struct MCID -> accumulated text).
      const mcidDocMap = {};
      for (let itr = await doc.getPageIterator(); await itr.hasNext(); await itr.next()) {
        const page = await itr.current();
        await reader.beginOnPage(page);
        const pageNum = await page.getIndex();
        const pageMcidMap = {};
        mcidDocMap[pageNum] = pageMcidMap;
        await processElements2(reader, pageMcidMap);
        await reader.end();
      }

      tree = await doc.getStructTree();
      if (await tree.isValid()) {
        for (let i = 0, numKids = await tree.getNumKids(); i < numKids; ++i) {
          await processStructElement2(await tree.getKid(i), mcidDocMap, 0, printState);
        }
      }
    }
    printAndIndent(printState, 0);
    console.log('Done 3.');
    await doc.save(inputPath + 'Output/LogicalStructure.pdf', 0);
  } catch (err) {
    console.log(err);
  }
};
245
246 PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function(error){console.log('Error: ' + JSON.stringify(error));}).then(function(){return PDFNet.shutdown();});
247 };
248 exports.runLogicalStructureTest();
249})(exports);
250// eslint-disable-next-line spaced-comment
251//# sourceURL=LogicalStructureTest.js
1<?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
10// Relative path to the folder containing the test files.
11$input_path = getcwd()."/../../TestFiles/";
12$output_path = $input_path."Output/";
13
14//---------------------------------------------------------------------------------------
15// This sample explores the structure and content of a tagged PDF document and dumps
16// the structure information to the console window.
17//
18// In tagged PDF documents StructTree acts as a central repository for information
19// related to a PDF document's logical structure. The tree consists of StructElement-s
20// and ContentItem-s which are leaf nodes of the structure tree.
21//
22// The sample can be extended to access and extract the marked-content elements such
23// as text and images.
24//---------------------------------------------------------------------------------------
25
// Starts a new output line (a newline passed through nl2br) and writes
// $ident leading spaces.
function PrintIdent($ident)
{
    echo nl2br("\n");
    if ($ident > 0) {
        echo str_repeat(" ", $ident);
    }
}
27
28// Used in code snippet 1.
// Recursively prints a structure element's type and title, then one entry per
// kid: content items are described in place, child structure elements are
// printed one indent level deeper.
function ProcessStructElement($element, $ident)
{
    if (!$element->IsValid()) {
        return;
    }

    // Header line for this element; everything below it is one level deeper.
    PrintIdent($ident);
    $kid_ident = $ident + 1;
    echo "Type: ".$element->GetType();
    if ($element->HasTitle()) {
        echo ". Title: ".$element->GetTitle();
    }

    $kid_count = $element->GetNumKids();
    for ($k = 0; $k < $kid_count; ++$k) {
        if (!$element->IsContentItem($k)) {
            // the kid is another StructElement node.
            ProcessStructElement($element->GetAsStructElem($k), $kid_ident);
            continue;
        }

        // The kid is a leaf node (i.e. it is a ContentItem).
        $item = $element->GetAsContentItem($k);
        $item_type = $item->GetType();
        $item_page = $item->GetPage();

        PrintIdent($kid_ident);
        echo "Content Item. Part of page #".$item_page->GetIndex();

        PrintIdent($kid_ident);
        if ($item_type == ContentItem::e_MCID || $item_type == ContentItem::e_MCR) {
            echo "MCID: ".$item->GetMCID();
        } elseif ($item_type == ContentItem::e_OBJR) {
            echo "OBJR ";
            $referenced = $item->GetRefObj();
            if ($referenced) {
                echo "- Referenced Object#: ".$referenced->GetObjNum();
            }
        }
    }
}
77
78// Used in code snippet 2.
// Reads every element from $reader and, for paths, text and form XObjects,
// prints a line describing the element and, when it has a valid parent
// structure element, that parent's type, MCID, title and object number.
function ProcessElements($reader)
{
    while ($element = $reader->Next()) // Read page contents
    {
        // In this sample we process only paths, text and form XObjects, but
        // the code can be extended to handle any element type.
        $type = $element->GetType();
        // The third test previously repeated e_path, which made the e_form
        // case below unreachable.
        if ($type == Element::e_path || $type == Element::e_text || $type == Element::e_form)
        {
            switch ($type) {
            case Element::e_path: // Process path ...
                echo nl2br("\nPATH: ");
                break;
            case Element::e_text: // Process text ...
                echo nl2br("\nTEXT: ".$element->GetTextString()."\n");
                break;
            case Element::e_form: // Process form XObjects
                echo nl2br("\nFORM XObject: ");
                //$reader->FormBegin();
                //ProcessElements($reader);
                //$reader->End();
                break;
            }

            // Check if the element is associated with any structural element.
            // Content items are leaf nodes of the structure tree.
            $struct_parent = $element->GetParentStructElement();
            if ($struct_parent->IsValid()) {
                // Print out the parent structural element's type, title, and object number.
                echo " Type: ".$struct_parent->GetType()
                    .", MCID: ".$element->GetStructMCID();
                if ($struct_parent->HasTitle()) {
                    echo ". Title: ".$struct_parent->GetTitle();
                }
                echo ", Obj#: ".$struct_parent->GetSDFObj()->GetObjNum();
            }
        }
    }
}
118
119// Used in code snippet 3.
// Drains $reader and accumulates, per struct MCID, the text of every text
// element that carries a non-negative MCID. $mcid_page_map is updated in place.
function ProcessElements2($reader, &$mcid_page_map)
{
    // Read page contents
    for ($current = $reader->Next(); $current != null; $current = $reader->Next()) {
        // Only text is handled here; paths, images, or any other Element type
        // could be processed in the same loop.
        $mcid = $current->GetStructMCID();
        if ($mcid < 0 || $current->GetType() != Element::e_text) {
            continue;
        }

        $text = $current->GetTextString();
        // Append to the text already stored for this MCID, or start a new entry.
        $mcid_page_map[$mcid] = array_key_exists($mcid, $mcid_page_map)
            ? $mcid_page_map[$mcid].$text
            : $text;
    }
}
139
140// Used in code snippet 3.
// Recursively prints the structure element as an XML-like tag, with the page
// text collected for its MCID content items between the opening and closing
// tags. Child structure elements are printed one indent level deeper.
// $mcid_doc_map maps a page index to that page's MCID-to-text array.
function ProcessStructElement2($element, &$mcid_doc_map, $ident)
{
    if (!$element->IsValid()) {
        return;
    }

    // Opening tag: type plus optional title attribute.
    PrintIdent($ident);
    echo "<".$element->GetType();
    if ($element->HasTitle()) {
        echo " title=\"".$element->GetTitle()."\"";
    }
    echo ">";

    $kid_count = $element->GetNumKids();
    for ($k = 0; $k < $kid_count; ++$k) {
        if (!$element->IsContentItem($k)) {
            // the kid is another StructElement node.
            ProcessStructElement2($element->GetAsStructElem($k), $mcid_doc_map, $ident + 1);
            continue;
        }

        $item = $element->GetAsContentItem($k);
        if ($item->GetType() != ContentItem::e_MCID) {
            continue;
        }
        $page_num = $item->GetPage()->GetIndex();
        if (!array_key_exists($page_num, $mcid_doc_map)) {
            continue;
        }
        $page_texts = $mcid_doc_map[$page_num];
        $mcid = $item->GetMCID();
        if (array_key_exists($mcid, $page_texts)) {
            echo $page_texts[$mcid];
        }
    }

    // Closing tag.
    PrintIdent($ident);
    echo "</".$element->GetType().">";
}
178
PDFNet::Initialize($LicenseKey);
PDFNet::GetSystemFontList();    // Wait for fonts to be loaded if they haven't already. This is done because PHP can run into errors when shutting down if font loading is still in progress.

// Extract logical structure from a PDF document
$doc = new PDFDoc($input_path."tagged.pdf");
$doc->InitSecurityHandler();

// ---- Sample 1: walk the structure tree -------------------------------------
echo nl2br("____________________________________________________________\n");
echo nl2br("Sample 1 - Traverse logical structure tree...\n");

$tree = $doc->GetStructTree();
if (!$tree->IsValid()) {
    echo nl2br("This document does not contain any logical structure.\n");
} else {
    echo nl2br("Document has a StructTree root.\n");

    // Recursively get structure info for all child elements.
    for ($kid = 0; $kid < $tree->GetNumKids(); ++$kid) {
        ProcessStructElement($tree->GetKid($kid), 0);
    }
}

echo nl2br("\nDone 1.\n");

// ---- Sample 2: parent structure element of each layout element -------------
echo nl2br("____________________________________________________________\n");
echo nl2br("Sample 2 - Get parent logical structure elements from\n");
echo nl2br("layout elements.\n");

$reader = new ElementReader();
$itr = $doc->GetPageIterator();
while ($itr->HasNext()) {
    $reader->Begin($itr->Current());
    ProcessElements($reader);
    $reader->End();
    $itr->Next();
}

echo nl2br("\nDone 2.\n");

// ---- Sample 3: XML-style dump of structure plus page text ------------------
echo nl2br("____________________________________________________________\n");
echo nl2br("Sample 3 - 'XML style' extraction of PDF logical structure and page content.\n");

// Maps page index -> (struct MCID -> accumulated text).
$mcid_doc_map = array();
$reader = new ElementReader();
$itr = $doc->GetPageIterator();
while ($itr->HasNext()) {
    $reader->Begin($itr->Current());
    $page_index = $itr->Current()->GetIndex();
    $mcid_doc_map[$page_index] = array();
    // Passed by reference so the per-page array is filled in place.
    ProcessElements2($reader, $mcid_doc_map[$page_index]);
    $reader->End();
    $itr->Next();
}

$tree = $doc->GetStructTree();
if ($tree->IsValid()) {
    for ($kid = 0; $kid < $tree->GetNumKids(); ++$kid) {
        ProcessStructElement2($tree->GetKid($kid), $mcid_doc_map, 0);
    }
}

echo nl2br("\nDone 3.\n");
$doc->Save(($output_path ."LogicalStructure.pdf"), SDFDoc::e_linearized);
$doc->Close();
PDFNet::Terminate();
240?>
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
14#---------------------------------------------------------------------------------------
15# This sample explores the structure and content of a tagged PDF document and dumps
16# the structure information to the console window.
17#
18# In tagged PDF documents StructTree acts as a central repository for information
19# related to a PDF document's logical structure. The tree consists of StructElement-s
20# and ContentItem-s which are leaf nodes of the structure tree.
21#
22# The sample can be extended to access and extract the marked-content elements such
23# as text and images.
24#---------------------------------------------------------------------------------------
25
def PrintIndent(indent):
    """Start a new output line and write `indent` leading spaces."""
    sys.stdout.write("\n")
    for _ in range(indent):
        sys.stdout.write(" ")
32
def ProcessStructElement(element, indent):
    """Recursively print a structure element and its kids.

    Writes the element's type and optional title at `indent`, then one entry
    per kid at `indent + 1`: content items are described in place, child
    structure elements are printed recursively. Invalid elements are skipped.
    """
    if not element.IsValid():
        return

    # Print out the type and title info, if any.
    PrintIndent(indent)
    indent = indent + 1
    sys.stdout.write("Type: " + element.GetType())
    if element.HasTitle():
        # ". Title: " (with the trailing space) matches ProcessElements.
        sys.stdout.write(". Title: " + element.GetTitle())

    for i in range(element.GetNumKids()):
        # Check if the kid is a leaf node (i.e. it is a ContentItem)
        if not element.IsContentItem(i):
            ProcessStructElement(element.GetAsStructElem(i), indent)
            continue

        cont = element.GetAsContentItem(i)
        # Named item_type so the builtin `type` is not shadowed.
        item_type = cont.GetType()
        page = cont.GetPage()

        PrintIndent(indent)
        sys.stdout.write("Content Item. Part of page #" + str(page.GetIndex()))
        PrintIndent(indent)
        if item_type in (ContentItem.e_MCID, ContentItem.e_MCR):
            sys.stdout.write("MCID: " + str(cont.GetMCID()))
        elif item_type == ContentItem.e_OBJR:
            sys.stdout.write("OBJR ")
            ref_obj = cont.GetRefObj()
            if ref_obj is not None:
                sys.stdout.write("- Referenced Object#: " + str(ref_obj.GetObjNum()))
69
70
71# Used in code snippet 3.
def ProcessElements2(reader, mcid_page_map):
    """Accumulate page text per struct MCID.

    Drains `reader` and, for every text element with a non-negative MCID,
    appends its text to `mcid_page_map[mcid]` (the map is updated in place).
    """
    # Read page contents; reader.Next() returns None once the page is exhausted.
    for element in iter(reader.Next, None):
        # Only text is handled here; paths, images, or other Element types
        # could be processed in the same loop.
        mcid = element.GetStructMCID()
        if mcid < 0 or element.GetType() != Element.e_text:
            continue

        text = element.GetTextString()
        if mcid in mcid_page_map:
            mcid_page_map[mcid] = str(mcid_page_map[mcid]) + text
        else:
            mcid_page_map[mcid] = text
87
88# Used in code snippet 2.
def ProcessElements(reader):
    """Print each path, text and form XObject element read from `reader`.

    For every such element a line describing it is written; when the element
    has a valid parent structure element, that parent's type, the element's
    MCID, the parent's title (if any) and its object number are appended.
    """
    element = reader.Next()
    while element is not None:    # Read page contents
        # In this sample we process only paths, text and form XObjects, but
        # the code can be extended to handle any element type.
        elem_type = element.GetType()
        # The third alternative previously repeated e_path (here and in the
        # form branch below), so form XObjects could never be reported.
        if elem_type in (Element.e_path, Element.e_text, Element.e_form):
            if elem_type == Element.e_path:      # Process path ...
                sys.stdout.write("\nPATH: ")
            elif elem_type == Element.e_text:    # Process text ...
                sys.stdout.write("\nTEXT: " + element.GetTextString() + "\n")
            elif elem_type == Element.e_form:    # Process form XObjects
                sys.stdout.write("\nFORM XObject: ")

            # Check if the element is associated with any structural element.
            # Content items are leaf nodes of the structure tree.
            struct_parent = element.GetParentStructElement()
            if struct_parent.IsValid():
                # Print out the parent structural element's type, title, and object number.
                sys.stdout.write(" Type: " + str(struct_parent.GetType())
                                 + ", MCID: " + str(element.GetStructMCID()))
                if struct_parent.HasTitle():
                    sys.stdout.write(". Title: " + struct_parent.GetTitle())
                sys.stdout.write(", Obj#: " + str(struct_parent.GetSDFObj().GetObjNum()))
        element = reader.Next()
116
117
def ProcessStructElement2(element, mcid_doc_map, indent):
    """Recursively print a structure element as an XML-like tag.

    The text collected for the element's MCID content items (looked up in
    `mcid_doc_map`, which maps page index -> MCID -> text) is written between
    the opening and closing tags; child structure elements are printed one
    indent level deeper. Invalid elements are skipped.
    """
    if not element.IsValid():
        return

    # Opening tag: type plus optional title attribute.
    PrintIndent(indent)
    sys.stdout.write("<" + element.GetType())
    if element.HasTitle():
        sys.stdout.write(" title=\"" + element.GetTitle() + "\"")
    sys.stdout.write(">")

    for kid in range(element.GetNumKids()):
        if not element.IsContentItem(kid):
            # the kid is another StructElement node.
            ProcessStructElement2(element.GetAsStructElem(kid), mcid_doc_map, indent+1)
            continue

        item = element.GetAsContentItem(kid)
        if item.GetType() != ContentItem.e_MCID:
            continue
        page_num = item.GetPage().GetIndex()
        if page_num not in mcid_doc_map:
            continue
        page_texts = mcid_doc_map[page_num]
        mcid_key = item.GetMCID()
        if mcid_key in page_texts:
            sys.stdout.write(page_texts[mcid_key])

    # Closing tag.
    PrintIndent(indent)
    sys.stdout.write("</" + element.GetType() + ">")
146
147
def main():
    """Run the three logical-structure samples against tagged.pdf.

    (1) Traverses the structure tree, (2) looks up the parent structure
    element of each layout element, (3) prints an XML-style dump combining the
    structure tree with the page text, then saves the document to the output
    folder.
    """
    PDFNet.Initialize(LicenseKey)

    # Relative path to the folder containing the test files.
    input_path = "../../TestFiles/"
    output_path = "../../TestFiles/Output/"

    # Extract logical structure from a PDF document
    doc = PDFDoc(input_path + "tagged.pdf")
    doc.InitSecurityHandler()

    # ---- Sample 1: walk the structure tree ----
    print("____________________________________________________________")
    print("Sample 1 - Traverse logical structure tree...")

    tree = doc.GetStructTree()
    if not tree.IsValid():
        print("This document does not contain any logical structure.")
    else:
        print("Document has a StructTree root.")

        kid = 0
        while kid < tree.GetNumKids():
            # Recursively get structure info for all child elements.
            ProcessStructElement(tree.GetKid(kid), 0)
            kid += 1

    print("\nDone 1.")

    # ---- Sample 2: parent structure element of each layout element ----
    print("____________________________________________________________")
    print("Sample 2 - Get parent logical structure elements from")
    print("layout elements.")

    reader = ElementReader()
    pages = doc.GetPageIterator()
    while pages.HasNext():
        reader.Begin(pages.Current())
        ProcessElements(reader)
        reader.End()
        pages.Next()

    print("\nDone 2.")

    # ---- Sample 3: XML-style dump of structure plus page text ----
    print("____________________________________________________________")
    print("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
    # Maps page numbers (as integers) to per-page maps, each of which maps a
    # struct MCID (as an integer) to its text string.
    mcid_doc_map = {}
    reader = ElementReader()
    pages = doc.GetPageIterator()
    while pages.HasNext():
        reader.Begin(pages.Current())
        page_mcid_map = {}
        mcid_doc_map[pages.Current().GetIndex()] = page_mcid_map
        ProcessElements2(reader, page_mcid_map)
        reader.End()
        pages.Next()

    tree = doc.GetStructTree()
    if tree.IsValid():
        kid = 0
        while kid < tree.GetNumKids():
            ProcessStructElement2(tree.GetKid(kid), mcid_doc_map, 0)
            kid += 1

    print("\nDone 3.")
    doc.Save((output_path + "LogicalStructure.pdf"), SDFDoc.e_linearized)
    doc.Close()
    PDFNet.Terminate()
215
216if __name__ == '__main__':
217 main()
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
12#---------------------------------------------------------------------------------------
13# This sample explores the structure and content of a tagged PDF document and dumps
14# the structure information to the console window.
15#
16# In tagged PDF documents StructTree acts as a central repository for information
17# related to a PDF document's logical structure. The tree consists of StructElement-s
18# and ContentItem-s which are leaf nodes of the structure tree.
19#
20# The sample can be extended to access and extract the marked-content elements such
21# as text and images.
22#---------------------------------------------------------------------------------------
23
# Starts a new output line and writes +indent+ leading spaces.
def PrintIndent(indent)
  print "\n"
  indent.times { print " " }
end
32
# Recursively prints a structure element's type and title at +indent+, then
# one entry per kid at +indent + 1+: content items are described in place,
# child structure elements are printed recursively. Invalid elements are
# skipped.
def ProcessStructElement(element, indent)
  if !element.IsValid
    return
  end

  # Print out the type and title info, if any.
  PrintIndent(indent)
  indent = indent + 1
  print "Type: " + element.GetType
  if element.HasTitle
    # ". Title: " (with the trailing space) matches ProcessElements.
    print ". Title: " + element.GetTitle
  end

  num = element.GetNumKids
  i = 0
  while i < num do
    # Check if the kid is a leaf node (i.e. it is a ContentItem)
    if element.IsContentItem(i)
      cont = element.GetAsContentItem(i)
      type = cont.GetType

      page = cont.GetPage

      PrintIndent(indent)
      print "Content Item. Part of page #" + page.GetIndex.to_s
      PrintIndent(indent)
      case type
      when ContentItem::E_MCID, ContentItem::E_MCR
        print "MCID: " + cont.GetMCID.to_s
      when ContentItem::E_OBJR
        print "OBJR "
        ref_obj = cont.GetRefObj
        if !ref_obj.nil?
          print "- Referenced Object#: " + ref_obj.GetObjNum.to_s
        end
      end
    else
      # the kid is another StructElement node.
      ProcessStructElement(element.GetAsStructElem(i), indent)
    end
    i = i + 1
  end
end
77
78# Used in code snippet 3.
# Drains +reader+ and returns a Hash that maps each non-negative struct MCID
# to the concatenated text of the text elements carrying it.
def ProcessElements2(reader)
  texts_by_mcid = {}

  # Read page contents; reader.Next yields nil once the page is exhausted.
  until (element = reader.Next).nil?
    # Only text is handled here; paths, images, or other Element types could
    # be processed in the same loop.
    mcid = element.GetStructMCID
    next unless mcid >= 0 && element.GetType == Element::E_text

    chunk = element.GetTextString
    texts_by_mcid[mcid] =
      if texts_by_mcid.has_key?(mcid)
        texts_by_mcid[mcid].to_s + chunk
      else
        chunk
      end
  end

  texts_by_mcid
end
100
101# Used in code snippet 2.
# Reads every element from +reader+ and, for paths, text and form XObjects,
# prints a line describing the element and, when it has a valid parent
# structure element, that parent's type, the element's MCID, the parent's
# title (if any) and its object number.
def ProcessElements(reader)
  element = reader.Next
  while !element.nil? do    # Read page contents
    # In this sample we process only paths, text and form XObjects, but the
    # code can be extended to handle any element type.
    type = element.GetType
    # The third alternative previously repeated E_path (here and in the form
    # arm below), so form XObjects could never be reported.
    if (type == Element::E_path or
        type == Element::E_text or
        type == Element::E_form)
      case type
      when Element::E_path    # Process path ...
        print "\nPATH: "
      when Element::E_text    # Process text ...
        print "\nTEXT: " + element.GetTextString + "\n"
      when Element::E_form    # Process form XObjects
        print "\nFORM XObject: "
      end

      # Check if the element is associated with any structural element.
      # Content items are leaf nodes of the structure tree.
      struct_parent = element.GetParentStructElement
      if struct_parent.IsValid
        # Print out the parent structural element's type, title, and object number.
        print " Type: " + struct_parent.GetType.to_s + ", MCID: " + element.GetStructMCID.to_s
        if struct_parent.HasTitle
          print ". Title: " + struct_parent.GetTitle
        end
        print ", Obj#: " + struct_parent.GetSDFObj.GetObjNum.to_s
      end
    end
    element = reader.Next
  end
end
135
# Recursively prints the structure element as an XML-like tag, with the text
# collected for its MCID content items (looked up in +mcid_doc_map+, which
# maps page index -> MCID -> text) between the opening and closing tags.
# Child structure elements are printed one indent level deeper.
def ProcessStructElement2(element, mcid_doc_map, indent)
  return unless element.IsValid

  # Opening tag: type plus optional title attribute.
  PrintIndent(indent)
  print "<" + element.GetType
  print " title=\"" + element.GetTitle + "\"" if element.HasTitle
  print ">"

  (0...element.GetNumKids).each do |kid|
    unless element.IsContentItem(kid)
      # the kid is another StructElement node.
      ProcessStructElement2(element.GetAsStructElem(kid), mcid_doc_map, indent+1)
      next
    end

    item = element.GetAsContentItem(kid)
    next unless item.GetType == ContentItem::E_MCID

    page_num = item.GetPage.GetIndex
    next unless mcid_doc_map.has_key?(page_num)

    page_texts = mcid_doc_map[page_num]
    mcid_key = item.GetMCID
    print page_texts[mcid_key] if page_texts.has_key?(mcid_key)
  end

  # Closing tag.
  PrintIndent(indent)
  print "</" + element.GetType + ">"
end
172
# Script body: opens the tagged sample document, runs the three logical
# structure demonstrations in order, then saves a copy of the document.
PDFNet.Initialize(PDFTronLicense.Key)

# Relative path to the folder containing the test files.
input_path = "../../TestFiles/"
output_path = "../../TestFiles/Output/"

# Extract logical structure from a PDF document
doc = PDFDoc.new(input_path + "tagged.pdf")
doc.InitSecurityHandler

puts "____________________________________________________________"
puts "Sample 1 - Traverse logical structure tree..."

struct_tree = doc.GetStructTree
if struct_tree.IsValid
  puts "Document has a StructTree root."

  # Recursively get structure info for all child elements.
  struct_tree.GetNumKids.times do |kid_index|
    ProcessStructElement(struct_tree.GetKid(kid_index), 0)
  end
else
  puts "This document does not contain any logical structure."
end

puts "\nDone 1."

puts "____________________________________________________________"
puts "Sample 2 - Get parent logical structure elements from"
puts "layout elements."

page_reader = ElementReader.new
pages = doc.GetPageIterator
while pages.HasNext
  page_reader.Begin(pages.Current)
  ProcessElements(page_reader)
  page_reader.End
  pages.Next
end

puts "\nDone 2."

puts "____________________________________________________________"
puts "Sample 3 - 'XML style' extraction of PDF logical structure and page content."

# A map from page numbers (as Integers) to per-page maps, each of which maps
# a struct MCID (as Integer) to the text String collected for it.
text_by_page = Hash.new
page_reader = ElementReader.new
pages = doc.GetPageIterator
while pages.HasNext
  page = pages.Current
  page_reader.Begin(page)
  text_by_page[page.GetIndex] = ProcessElements2(page_reader)
  page_reader.End
  pages.Next
end

struct_tree = doc.GetStructTree
if struct_tree.IsValid
  struct_tree.GetNumKids.times do |kid_index|
    ProcessStructElement2(struct_tree.GetKid(kid_index), text_by_page, 0)
  end
end
puts "\nDone 3."

doc.Save((output_path + "LogicalStructure.pdf"), SDFDoc::E_linearized)
doc.Close
PDFNet.Terminate
1'
2' Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3'
4
5Imports System
6Imports System.Collections
7Imports pdftron
8Imports pdftron.Common
9Imports pdftron.Filters
10Imports pdftron.SDF
11Imports pdftron.PDF
12Imports pdftron.PDF.Struct
13
''' <summary>
''' Explores the logical structure and content of a tagged PDF document and
''' dumps the information to the console. Three demonstrations are run from
''' <c>Main</c>: a structure-tree traversal, a lookup of the structure parent of
''' each page element, and an XML-style dump that combines structure and text.
''' </summary>
Module LogicalStructureTestCS
    Dim pdfNetLoader As PDFNetLoader
    Sub New()
        pdfNetLoader = pdftron.PDFNetLoader.Instance()
    End Sub

    ''' <summary>
    ''' Starts a new console line and writes one indentation unit per level.
    ''' </summary>
    ''' <param name="indent">Number of indentation units to write; values of
    ''' zero or less write none.</param>
    Sub PrintIndent(ByVal indent As Integer)
        Console.WriteLine()

        For i As Integer = 0 To indent - 1
            Console.Write(" ")
        Next
    End Sub

    ''' <summary>
    ''' Used in sample 1. Prints the type and title of a structure element, then
    ''' recursively prints its kids (content items and child structure elements).
    ''' </summary>
    ''' <param name="element">Structure element to print; invalid elements are ignored.</param>
    ''' <param name="indent">Indentation level for this element's header line.</param>
    Sub ProcessStructElement(ByVal element As SElement, ByVal indent As Integer)
        If Not element.IsValid() Then
            Return
        End If

        ' The header is printed at the incoming level; everything that follows
        ' (content items and child elements) is printed one level deeper.
        PrintIndent(indent)
        indent += 1
        Console.Write("Type: " & element.[GetType]())

        If element.HasTitle() Then
            Console.Write(". Title: " & element.GetTitle())
        End If

        Dim num As Integer = element.GetNumKids()

        For i As Integer = 0 To num - 1

            If element.IsContentItem(i) Then
                Dim cont As ContentItem = element.GetAsContentItem(i)
                Dim type As ContentItem.Type = cont.[GetType]()
                Dim page As Page = cont.GetPage()
                PrintIndent(indent)
                Console.Write("Content Item. Part of page #" & page.GetIndex())
                PrintIndent(indent)

                Select Case type
                    Case ContentItem.Type.e_MCID, ContentItem.Type.e_MCR
                        Console.Write("MCID: " & cont.GetMCID())
                    Case ContentItem.Type.e_OBJR
                        Console.Write("OBJR ")
                        Dim ref_obj As Obj = cont.GetRefObj()
                        If ref_obj IsNot Nothing Then Console.Write("- Referenced Object#: " & ref_obj.GetObjNum())
                    Case Else
                End Select
            Else
                ' The kid is another structure element.
                ProcessStructElement(element.GetAsStructElem(i), indent)
            End If
        Next
    End Sub

    ''' <summary>
    ''' Used in sample 2. For every path, text and form XObject element the
    ''' reader yields, prints a label and, when the element has a valid
    ''' structure parent, that parent's type, the element's MCID, the parent's
    ''' title (if any) and its object number.
    ''' </summary>
    ''' <param name="reader">Reader that yields the page's elements through
    ''' <c>Next()</c> and returns Nothing when the content is exhausted.</param>
    Sub ProcessElements(ByVal reader As ElementReader)
        Dim element As Element = reader.Next()
        While Not IsNothing(element) ' Read page contents
            Dim type As Element.Type = element.[GetType]()

            ' Form XObjects must be part of the filter, otherwise the e_form
            ' case below can never be reached.
            If type = Element.Type.e_path OrElse type = Element.Type.e_text OrElse type = Element.Type.e_form Then

                Select Case type
                    Case Element.Type.e_path
                        Console.WriteLine()
                        Console.Write("PATH: ")
                    Case Element.Type.e_text
                        Console.WriteLine()
                        Console.WriteLine("TEXT: " & element.GetTextString())
                    Case Element.Type.e_form
                        Console.WriteLine()
                        Console.Write("FORM XObject: ")
                End Select

                ' Check if the element is associated with any structural element.
                Dim struct_parent As SElement = element.GetParentStructElement()

                If struct_parent.IsValid() Then
                    Console.Write(" Type: " & struct_parent.[GetType]() & ", MCID: " & element.GetStructMCID())

                    If struct_parent.HasTitle() Then
                        Console.Write(". Title: " & struct_parent.GetTitle())
                    End If

                    Console.Write(", Obj#: " & struct_parent.GetSDFObj().GetObjNum())
                End If
            End If
            element = reader.Next()
        End While
    End Sub

    ''' <summary>
    ''' Used in sample 3. Collects the text of one page grouped by marked-content
    ''' ID: text elements with a non-negative MCID have their text appended to
    ''' the entry for that MCID.
    ''' </summary>
    ''' <param name="reader">Reader that yields the page's elements.</param>
    ''' <param name="mcid_page_map">Table filled in place; keys are MCIDs
    ''' (Integer), values are the accumulated text (String).</param>
    Sub ProcessElements2(ByVal reader As ElementReader, ByVal mcid_page_map As Hashtable)
        Dim element As Element = reader.Next()
        While Not IsNothing(element) ' Read page contents
            Dim mcid As Integer = element.GetStructMCID()

            If mcid >= 0 AndAlso element.[GetType]() = Element.Type.e_text Then
                Dim val As String = element.GetTextString()

                If mcid_page_map.ContainsKey(mcid) Then
                    ' Several text runs can share one MCID; append to the existing text.
                    mcid_page_map(mcid) = (CStr((mcid_page_map(mcid))) & val)
                Else
                    mcid_page_map.Add(mcid, val)
                End If
            End If
            element = reader.Next()
        End While
    End Sub

    ''' <summary>
    ''' Used in sample 3. Prints a structure element and its descendants as an
    ''' XML-like fragment, inserting the text collected for each MCID content item.
    ''' </summary>
    ''' <param name="element">Structure element to print; invalid elements are ignored.</param>
    ''' <param name="mcid_doc_map">Table from page index to the per-page table
    ''' produced by <c>ProcessElements2</c>.</param>
    ''' <param name="indent">Indentation level for the opening and closing tag lines.</param>
    Sub ProcessStructElement2(ByVal element As SElement, ByVal mcid_doc_map As Hashtable, ByVal indent As Integer)
        If Not element.IsValid() Then
            Return
        End If

        ' Opening tag, with the title attribute when the element has one.
        PrintIndent(indent)
        Console.Write("<" & element.[GetType]())

        If element.HasTitle() Then
            Console.Write(" title=""" & element.GetTitle() & """")
        End If

        Console.Write(">")
        Dim num As Integer = element.GetNumKids()

        For i As Integer = 0 To num - 1

            If element.IsContentItem(i) Then
                Dim cont As ContentItem = element.GetAsContentItem(i)

                If cont.[GetType]() = ContentItem.Type.e_MCID Then
                    Dim page_num As Integer = cont.GetPage().GetIndex()

                    If mcid_doc_map.ContainsKey(page_num) Then
                        Dim mcid_page_map As Hashtable = CType((mcid_doc_map(page_num)), Hashtable)
                        Dim mcid As Integer = cont.GetMCID()

                        If mcid_page_map.ContainsKey(mcid) Then
                            Console.Write(mcid_page_map(mcid))
                        End If
                    End If
                End If
            Else
                ' The kid is another structure element; print it one level deeper.
                ProcessStructElement2(element.GetAsStructElem(i), mcid_doc_map, indent + 1)
            End If
        Next

        ' Closing tag.
        PrintIndent(indent)
        Console.Write("</" & element.[GetType]() & ">")
    End Sub


    ''' <summary>
    ''' Entry point: opens "tagged.pdf", runs the three samples, and saves the
    ''' document as "LogicalStructure.pdf" in the output folder. A
    ''' PDFNetException raised while processing is reported on the console.
    ''' </summary>
    ''' <param name="args">Command-line arguments (not used).</param>
    Sub Main(ByVal args As String())
        PDFNet.Initialize(PDFTronLicense.Key)

        ' Relative paths to the folders containing the test files.
        Dim input_path As String = "../../../../TestFiles/"
        Dim output_path As String = "../../../../TestFiles/Output/"

        Try

            Using doc As PDFDoc = New PDFDoc(input_path & "tagged.pdf")
                doc.InitSecurityHandler()

                ' Switches that allow individual samples to be turned off.
                Dim example1 As Boolean = True
                Dim example2 As Boolean = True
                Dim example3 As Boolean = True

                If example1 Then
                    Console.WriteLine("____________________________________________________________")
                    Console.WriteLine("Sample 1 - Traverse logical structure tree...")
                    Dim tree As STree = doc.GetStructTree()

                    If tree.IsValid() Then
                        Console.WriteLine("Document has a StructTree root.")

                        ' Recursively get structure info for all child elements.
                        For i As Integer = 0 To tree.GetNumKids() - 1
                            ProcessStructElement(tree.GetKid(i), 0)
                        Next
                    Else
                        Console.WriteLine("This document does not contain any logical structure.")
                    End If

                    Console.WriteLine()
                    Console.WriteLine("Done 1.")
                End If

                If example2 Then
                    Console.WriteLine("____________________________________________________________")
                    Console.WriteLine("Sample 2 - Get parent logical structure elements from")
                    Console.WriteLine("layout elements.")

                    ' Dispose the reader as soon as the page walk is finished.
                    Using reader As ElementReader = New ElementReader()
                        Dim itr As PageIterator = doc.GetPageIterator()

                        While itr.HasNext()
                            reader.Begin(itr.Current())
                            ProcessElements(reader)
                            reader.[End]()
                            itr.[Next]()
                        End While
                    End Using

                    Console.WriteLine()
                    Console.WriteLine("Done 2.")
                End If

                If example3 Then
                    Console.WriteLine("____________________________________________________________")
                    Console.WriteLine("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")

                    ' Maps page numbers (Integer) to per-page tables, each of
                    ' which maps an MCID (Integer) to its text (String).
                    Dim mcid_doc_map As Hashtable = New Hashtable()

                    Using reader As ElementReader = New ElementReader()
                        Dim itr As PageIterator = doc.GetPageIterator()

                        While itr.HasNext()
                            Dim pg As Page = itr.Current()
                            reader.Begin(pg)
                            Dim page_mcid_map As Hashtable = New Hashtable()
                            mcid_doc_map.Add(pg.GetIndex(), page_mcid_map)
                            ProcessElements2(reader, page_mcid_map)
                            reader.[End]()
                            itr.[Next]()
                        End While
                    End Using

                    Dim tree As STree = doc.GetStructTree()

                    If tree.IsValid() Then

                        For i As Integer = 0 To tree.GetNumKids() - 1
                            ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0)
                        Next
                    End If

                    Console.WriteLine()
                    Console.WriteLine("Done 3.")
                End If

                ' NOTE(review): the save options are passed as the literal 0 -
                ' confirm which SDFDoc.SaveOptions value is intended here.
                doc.Save(output_path & "LogicalStructure.pdf", 0)
            End Using

        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try
        PDFNet.Terminate()
    End Sub

End Module
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales