LogicalStructure

Sample Obj-C code that uses the Apryse SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; ContentItems are the leaf nodes of the structure tree. Learn more about our iOS SDK and PDF Data Extraction SDK Capabilities.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
5
6#import <OBJC/PDFNetOBJC.h>
7#import <Foundation/Foundation.h>
8
//---------------------------------------------------------------------------------------
// This sample explores the structure and content of a tagged PDF document and dumps
// the structure information to the console window.
//
// In tagged PDF documents, StructTree acts as a central repository for information
// related to a PDF document's logical structure. The tree consists of StructElements
// and ContentItems; ContentItems are the leaf nodes of the structure tree.
//
// The sample can be extended to access and extract the marked-content elements such
// as text and images.
//---------------------------------------------------------------------------------------
20
21
// Builds the line prefix used by the dump routines: a newline followed by one
// space per nesting level. A zero or negative level yields just the newline.
NSString *PrintIdent(int ident) {
    NSMutableString *prefix = [NSMutableString stringWithString: @"\n"];
    int remaining = ident;
    while (remaining > 0) {
        [prefix appendString: @" "];
        remaining -= 1;
    }
    return [prefix copy];
}
30
// Used in code snippet 1.
//
// Recursively describes a structure element and everything below it.
//   element - the structure element to describe; an invalid element produces @"".
//   ident   - nesting level of this element; its kids are printed one level deeper.
// Returns the accumulated text: one "Type: ..." line for the element (plus its
// title, if it has one), followed by one entry per kid.
NSString* ProcessStructElement(PTSElement *element, int ident)
{
    if (![element IsValid]) {
        return @"";
    }

    NSMutableString *output = [NSMutableString string];

    // Header line for this element, written at the caller's nesting level.
    [output appendFormat: @"%@Type: %@", PrintIdent(ident), [element GetType]];
    if ([element HasTitle]) {
        [output appendFormat: @". Title: %@", [element GetTitle]];
    }

    // Everything below this element is written one level deeper.
    int kidIdent = ident + 1;
    int kidCount = [element GetNumKids];

    for (int kid = 0; kid < kidCount; kid++)
    {
        if (![element IsContentItem: kid]) {
            // The kid is another StructElement node: recurse into it.
            [output appendString: ProcessStructElement([element GetAsStructElem: kid], kidIdent)];
            continue;
        }

        // The kid is a leaf node (a ContentItem).
        PTContentItem *item = [element GetAsContentItem: kid];
        PTContentItemType itemType = [item GetType];
        PTPage *itemPage = [item GetPage];

        [output appendFormat: @"%@Content Item. Part of page #%d%@",
                              PrintIdent(kidIdent), [itemPage GetIndex], PrintIdent(kidIdent)];

        if (itemType == e_ptMCID || itemType == e_ptMCR) {
            [output appendFormat: @"MCID: %d", [item GetMCID]];
        }
        else if (itemType == e_ptOBJR) {
            [output appendString: @"OBJR "];
            PTObj *referenced = [item GetRefObj];
            if (referenced != NULL) {
                [output appendFormat: @"- Referenced Object#: %u", [referenced GetObjNum]];
            }
        }
        // Any other content item type adds nothing beyond the line above.
    }

    return output;
}
81
// Used in code snippet 2.
//
// Walks the elements produced by `reader` and, for every path, text object, or
// form XObject, reports the structure element (if any) that owns it.
//   reader - an element reader already positioned on a page (the caller is
//            responsible for Begin/End).
// Returns the accumulated report text; other element types contribute nothing.
//
// Fix: the filter previously tested e_ptpath twice, so the e_ptform case in the
// switch could never run and form XObjects were silently skipped.
NSString* ProcessLogicalStructureTestElements(PTElementReader *reader)
{
    PTElement *element;
    NSString *result = @"";
    while ((element = [reader Next]) != NULL) // Read page contents
    {
        // In this sample we process only paths, text and form XObjects, but the
        // code can be extended to handle any element type.
        PTElementType type = [element GetType];
        if (type == e_ptpath || type == e_pttext_obj || type == e_ptform)
        {
            switch (type) {
                case e_ptpath: // Process path ...
                    result = [result stringByAppendingString: @"\nPATH: "];
                    break;
                case e_pttext_obj: // Process text ...
                    result = [result stringByAppendingFormat: @"\nTEXT: %@\n", [element GetTextString]];
                    break;
                case e_ptform: // Process form XObjects
                    result = [result stringByAppendingString: @"\nFORM XObject: "];
                    // The form's own content stream is not descended into here.
                    // The original sample left this hint for doing so:
                    //reader.FormBegin();
                    //ProcessLogicalStructureTestElements(reader);
                    //reader.End();
                    break;
                default:
                    break;
            }

            // Check if the element is associated with any structural element.
            // Content items are leaf nodes of the structure tree.
            PTSElement *struct_parent = [element GetParentStructElement];
            if ([struct_parent IsValid]) {
                // Print out the parent structural element's type, title, and object number.
                result = [result stringByAppendingFormat: @" Type: %@, MCID: %d", [struct_parent GetType], [element GetStructMCID]];
                if ([struct_parent HasTitle]) {
                    result = [result stringByAppendingFormat: @". Title: %@", [struct_parent GetTitle]];
                }
                result = [result stringByAppendingFormat: @", Obj#: %u", [[struct_parent GetSDFObj] GetObjNum]];
            }
        }
    }
    return result;
}
126
127// Used in code snippet 3.
// Used in code snippet 3.
// File-level placeholders that mirror the two map types described by the
// typedefs quoted below:
//   typedef map<int, string>      MCIDPageMap;
//   typedef map<int, MCIDPageMap> MCIDDocMap;
// Nothing in the code shown here reads or assigns them; the maps actually used
// are created locally in main().
NSMutableDictionary *MCIDPageMap;
NSMutableDictionary *MCIDDocMap;
132
// Used in code snippet 3.
//
// Collects the text of one page, grouped by marked-content ID.
//   reader        - an element reader already positioned on a page (the caller
//                   is responsible for Begin/End).
//   mcid_page_map - filled in place: key is the MCID boxed as an NSNumber, value
//                   is the concatenated text of every text element carrying
//                   that MCID, in reading order.
//
// The map is queried directly instead of scanning `allKeys`, which allocated a
// fresh key array and searched it linearly for every element.
void ProcessLogicalStructureTestElements2(PTElementReader *reader, NSMutableDictionary *mcid_page_map)
{
    PTElement *element;
    while ((element = [reader Next]) != nil) // Read page contents
    {
        // In this sample we process only text, but the code can be extended
        // to handle paths, images, or any other Element type.
        int mcid = [element GetStructMCID];
        if (mcid < 0 || [element GetType] != e_pttext_obj) {
            continue;
        }

        NSString *text = [element GetTextString];
        NSNumber *key = @(mcid);
        NSString *collected = mcid_page_map[key];
        if (collected != nil) {
            // Further text for an MCID already seen: append to what we have.
            mcid_page_map[key] = [collected stringByAppendingString: text];
        }
        else {
            mcid_page_map[key] = text;
        }
    }
}
156
// Used in code snippet 3.
//
// Renders a structure element and its subtree as XML-like text.
//   element      - the structure element to render; an invalid element produces @"".
//   mcid_doc_map - page index (NSNumber) -> per-page map of MCID (NSNumber) -> text.
//   ident        - nesting level used to indent the opening and closing tags.
// Returns "<Type title="...">", followed by the text of every e_ptMCID content
// item found in the map and the rendering of every child element, followed by
// "</Type>".
//
// Both maps are queried directly instead of scanning `allKeys`, which allocated
// a key array and searched it linearly for every content item.
NSString* ProcessStructElement2(PTSElement *element, NSMutableDictionary *mcid_doc_map, int ident)
{
    if (![element IsValid]) {
        return @"";
    }

    NSMutableString *result = [NSMutableString string];

    // Opening tag, with the title as an attribute when there is one.
    [result appendString: PrintIdent(ident)];
    [result appendFormat: @"<%@", [element GetType]];
    if ([element HasTitle]) {
        [result appendFormat: @" title=\"%@\"", [element GetTitle]];
    }
    [result appendString: @">"];

    int num = [element GetNumKids];
    for (int i = 0; i < num; ++i)
    {
        if ([element IsContentItem: i]) {
            PTContentItem *cont = [element GetAsContentItem: i];
            // Only plain MCID items are resolved to text; other kinds are skipped.
            if ([cont GetType] == e_ptMCID) {
                int page_num = [[cont GetPage] GetIndex];
                NSDictionary *mcid_page_map = mcid_doc_map[@(page_num)];
                if (mcid_page_map != nil) {
                    NSString *text = mcid_page_map[@([cont GetMCID])];
                    if (text != nil) {
                        [result appendString: text];
                    }
                }
            }
        }
        else { // the kid is another StructElement node.
            [result appendString: ProcessStructElement2([element GetAsStructElem: i], mcid_doc_map, ident + 1)];
        }
    }

    // Closing tag at the same nesting level as the opening tag.
    [result appendString: PrintIdent(ident)];
    [result appendFormat: @"</%@>", [element GetType]];
    return result;
}
203
204
// Entry point: runs the three logical-structure samples against tagged.pdf,
// logs their output, and saves the document. Returns 0 on success, or 1 when an
// exception was caught while processing.
int main(int argc, char *argv[])
{
    @autoreleasepool {
        int exitCode = 0;
        [PTPDFNet Initialize: 0];

        @try // Extract logical structure from a PDF document
        {
            PTPDFDoc *doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/tagged.pdf"];
            [doc InitSecurityHandler];

            // ---- Sample 1: walk the structure tree from its root kids ----
            NSLog(@"____________________________________________________________");
            NSLog(@"Sample 1 - Traverse logical structure tree...");
            {
                PTSTree *structTree = [doc GetStructTree];
                if (![structTree IsValid]) {
                    NSLog(@"This document does not contain any logical structure.");
                }
                else {
                    NSLog(@"Document has a StructTree root.");

                    int kid = 0;
                    while (kid < [structTree GetNumKids]) {
                        // Recursively get structure info for all child elements.
                        NSLog(@"%@", ProcessStructElement([structTree GetKid: kid], 0));
                        ++kid;
                    }
                }
            }
            NSLog(@"Done 1.");

            // ---- Sample 2: from page content up to the owning structure element ----
            NSLog(@"____________________________________________________________");
            NSLog(@"Sample 2 - Get parent logical structure elements from");
            NSLog(@"layout elements.");
            {
                PTElementReader *pageReader = [[PTElementReader alloc] init];
                PTPageIterator *pages = [doc GetPageIterator: 1];
                while ([pages HasNext]) {
                    [pageReader Begin: [pages Current]];
                    NSLog(@"%@", ProcessLogicalStructureTestElements(pageReader));
                    [pageReader End];
                    [pages Next];
                }
            }
            NSLog(@"Done 2.");

            // ---- Sample 3: gather text per page and MCID, then emit the tree ----
            NSLog(@"____________________________________________________________");
            NSLog(@"Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
            {
                NSMutableDictionary *textByPage = [[NSMutableDictionary alloc] init];
                PTElementReader *pageReader = [[PTElementReader alloc] init];
                PTPageIterator *pages = [doc GetPageIterator: 1];
                while ([pages HasNext]) {
                    [pageReader Begin: [pages Current]];
                    // One MCID -> text map per page, stored under the page index.
                    NSMutableDictionary *textByMCID = [[NSMutableDictionary alloc] init];
                    id pageKey = @([[pages Current] GetIndex]);
                    textByPage[pageKey] = textByMCID;
                    ProcessLogicalStructureTestElements2(pageReader, textByPage[pageKey]);
                    [pageReader End];
                    [pages Next];
                }

                PTSTree *structTree = [doc GetStructTree];
                if ([structTree IsValid]) {
                    int kid = 0;
                    while (kid < [structTree GetNumKids]) {
                        NSLog(@"%@", ProcessStructElement2([structTree GetKid: kid], textByPage, 0));
                        ++kid;
                    }
                }
            }
            NSLog(@"Done 3.");
            [doc SaveToFile: @"../../TestFiles/Output/LogicalStructure.pdf" flags: e_ptlinearized];
        }
        @catch(NSException *e)
        {
            NSLog(@"%@", e.reason);
            exitCode = 1;
        }
        [PTPDFNet Terminate: 0];
        return exitCode;
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales