PDF Logical Structure Reader - C++ Sample Code

Sample code for using Apryse Server SDK to explore the logical structure and content of a tagged PDF file and dump that information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s; the ContentItem-s are the leaf nodes of the structure tree. Sample code is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/PDFNet.h>
7#include <PDF/PDFDoc.h>
8#include <PDF/ElementReader.h>
9#include <iostream>
10#include <map>
11#include "../../LicenseKey/CPP/LicenseKey.h"
12
13using namespace pdftron;
14using namespace PDF;
15using namespace std;
16
17//---------------------------------------------------------------------------------------
18// This sample explores the structure and content of a tagged PDF document and dumps
19// the structure information to the console window.
20//
21// In tagged PDF documents StructTree acts as a central repository for information
22// related to a PDF document's logical structure. The tree consists of StructElement-s
23// and ContentItem-s which are leaf nodes of the structure tree.
24//
25// The sample can be extended to access and extract the marked-content elements such
26// as text and images.
27//---------------------------------------------------------------------------------------
28
29
// Starts a new console line and pads it with `indent` spaces.
// Nothing is written after the newline when `indent` is zero or negative.
void PrintIndent(int indent)
{
	std::cout << '\n';
	for (int remaining = indent; remaining > 0; --remaining) {
		std::cout << " ";
	}
}
31
32// Used in code snippet 1.
33void ProcessStructElement(Struct::SElement element, int ident)
34{
35 if (!element.IsValid()) {
36 return;
37 }
38
39 // Print out the type and title info, if any.
40 PrintIndent(ident++);
41 cout << "Type: "<< element.GetType();
42 if (element.HasTitle()) {
43 cout << ". Title: "<< element.GetTitle();
44 }
45
46 int num = element.GetNumKids();
47 for (int i=0; i<num; ++i)
48 {
49 // Check is the kid is a leaf node (i.e. it is a ContentItem).
50 if (element.IsContentItem(i)) {
51 Struct::ContentItem cont = element.GetAsContentItem(i);
52 Struct::ContentItem::Type type = cont.GetType();
53
54 Page page = cont.GetPage();
55
56 PrintIndent(ident);
57 cout << "Content Item. Part of page #" << page.GetIndex();
58
59 PrintIndent(ident);
60 switch (type) {
61 case Struct::ContentItem::e_MCID:
62 case Struct::ContentItem::e_MCR:
63 cout << "MCID: " << cont.GetMCID();
64 break;
65 case Struct::ContentItem::e_OBJR:
66 {
67 cout << "OBJR ";
68 if (SDF::Obj ref_obj = cont.GetRefObj())
69 cout << "- Referenced Object#: " << ref_obj.GetObjNum();
70 }
71 break;
72 default:
73 break;
74 }
75 }
76 else { // the kid is another StructElement node.
77 ProcessStructElement(element.GetAsStructElem(i), ident);
78 }
79 }
80}
81
82// Used in code snippet 2.
83void ProcessElements(ElementReader& reader)
84{
85 Element element;
86 while (element = reader.Next()) // Read page contents
87 {
88 // In this sample we process only paths & text, but the code can be
89 // extended to handle any element type.
90 Element::Type type = element.GetType();
91 if (type == Element::e_path || type == Element::e_text || type == Element::e_path)
92 {
93 switch (type) {
94 case Element::e_path: // Process path ...
95 cout << "\nPATH: ";
96 break;
97 case Element::e_text: // Process text ...
98 cout << "\nTEXT: " << element.GetTextString() << endl;
99 break;
100 case Element::e_form: // Process form XObjects
101 cout << "\nFORM XObject: ";
102 //reader.FormBegin();
103 //ProcessElements(reader);
104 //reader.End();
105 break;
106 }
107
108 // Check if the element is associated with any structural element.
109 // Content items are leaf nodes of the structure tree.
110 Struct::SElement struct_parent = element.GetParentStructElement();
111 if (struct_parent.IsValid()) {
112 // Print out the parent structural element's type, title, and object number.
113 cout << " Type: " << struct_parent.GetType()
114 << ", MCID: " << element.GetStructMCID();
115 if (struct_parent.HasTitle()) {
116 cout << ". Title: "<< struct_parent.GetTitle();
117 }
118 cout << ", Obj#: " << struct_parent.GetSDFObj().GetObjNum();
119 }
120 }
121 }
122}
123
124// Used in code snippet 3.
// Text collected for one page, keyed by marked-content identifier (MCID).
typedef std::map<int, std::string> MCIDPageMap;

// Per-page MCID text maps for a whole document, keyed by page index.
typedef std::map<int, MCIDPageMap> MCIDDocMap;
127
128// Used in code snippet 3.
129void ProcessElements2(ElementReader& reader, MCIDPageMap& mcid_page_map)
130{
131 Element element;
132 while (element = reader.Next()) // Read page contents
133 {
134 // In this sample we process only text, but the code can be extended
135 // to handle paths, images, or any other Element type.
136 int mcid = element.GetStructMCID();
137 if (mcid>= 0 && element.GetType() == Element::e_text) {
138 string val = element.GetTextString().ConvertToAscii();
139 MCIDPageMap::iterator itr = mcid_page_map.find(mcid);
140 if (itr != mcid_page_map.end()) itr->second += val;
141 else mcid_page_map.insert(MCIDPageMap::value_type(mcid, val));
142 }
143 }
144}
145
146// Used in code snippet 3.
147void ProcessStructElement2(Struct::SElement element, MCIDDocMap& mcid_doc_map, int ident)
148{
149 if (!element.IsValid()) {
150 return;
151 }
152
153 // Print out the type and title info, if any.
154 PrintIndent(ident);
155 cout << "<" << element.GetType();
156 if (element.HasTitle()) {
157 cout << " title=\""<< element.GetTitle() << "\"";
158 }
159 cout << ">";
160
161 int num = element.GetNumKids();
162 for (int i=0; i<num; ++i)
163 {
164 if (element.IsContentItem(i)) {
165 Struct::ContentItem cont = element.GetAsContentItem(i);
166 if (cont.GetType() == Struct::ContentItem::e_MCID) {
167 int page_num = cont.GetPage().GetIndex();
168 MCIDDocMap::iterator itr = mcid_doc_map.find(page_num);
169 if (itr!=mcid_doc_map.end()) {
170 MCIDPageMap& mcid_page_map = itr->second;
171 MCIDPageMap::iterator itr2 = mcid_page_map.find(cont.GetMCID());
172 if (itr2 != mcid_page_map.end()) {
173 cout << itr2->second;
174 }
175 }
176 }
177 }
178 else { // the kid is another StructElement node.
179 ProcessStructElement2(element.GetAsStructElem(i), mcid_doc_map, ident+1);
180 }
181 }
182
183 PrintIndent(ident);
184 cout << "</" << element.GetType() << ">";
185}
186
187
188int main(int argc, char *argv[])
189{
190 int ret = 0;
191 PDFNet::Initialize(LicenseKey);
192
193 // Relative path to the folder containing test files.
194 string input_path = "../../TestFiles/";
195 string output_path = "../../TestFiles/Output/";
196
197 try // Extract logical structure from a PDF document
198 {
199 PDFDoc doc((input_path + "tagged.pdf").c_str());
200 doc.InitSecurityHandler();
201
202 cout << "____________________________________________________________" << endl;
203 cout << "Sample 1 - Traverse logical structure tree..." << endl;
204 {
205 Struct::STree tree = doc.GetStructTree();
206 if (tree.IsValid()) {
207 cout << "Document has a StructTree root." << endl;
208
209 for (int i=0; i<tree.GetNumKids(); ++i) {
210 // Recursively get structure info for all child elements.
211 ProcessStructElement(tree.GetKid(i), 0);
212 }
213 }
214 else {
215 cout << "This document does not contain any logical structure." << endl;
216 }
217 }
218 cout << "\nDone 1." << endl;
219
220 cout << "____________________________________________________________" << endl;
221 cout << "Sample 2 - Get parent logical structure elements from" << endl;
222 cout << "layout elements." << endl;
223 {
224 ElementReader reader;
225 for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) {
226 reader.Begin(itr.Current());
227 ProcessElements(reader);
228 reader.End();
229 }
230 }
231 cout << "\nDone 2." << endl;
232
233 cout << "____________________________________________________________" << endl;
234 cout << "Sample 3 - 'XML style' extraction of PDF logical structure and page content." << endl;
235 {
236 MCIDDocMap mcid_doc_map;
237 ElementReader reader;
238 for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) {
239 reader.Begin(itr.Current());
240 pair<MCIDDocMap::iterator, bool> r = mcid_doc_map.insert(MCIDDocMap::value_type(itr.Current().GetIndex(), MCIDPageMap()));
241 MCIDPageMap& page_mcid_map = (r.first)->second;
242 ProcessElements2(reader, page_mcid_map);
243 reader.End();
244 }
245
246 Struct::STree tree = doc.GetStructTree();
247 if (tree.IsValid()) {
248 for (int i=0; i<tree.GetNumKids(); ++i) {
249 ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0);
250 }
251 }
252 }
253 cout << "\nDone 3." << endl;
254
255 doc.Save(output_path + "LogicalStructure.pdf", 0);
256 }
257 catch(Common::Exception& e)
258 {
259 cout << e << endl;
260 ret = 1;
261 }
262 catch(...)
263 {
264 cout << "Unknown Exception" << endl;
265 ret = 1;
266 }
267
268 PDFNet::Terminate();
269 return ret;
270}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales