Sample Obj-C code that uses the Apryse SDK to explore the logical structure and content of a tagged PDF file and dump that information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; the ContentItems are the leaf nodes of the structure tree. Learn more about our iOS SDK and PDF Data Extraction SDK Capabilities.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#import <OBJC/PDFNetOBJC.h>
7#import <Foundation/Foundation.h>
8
9//---------------------------------------------------------------------------------------
10// This sample explores the structure and content of a tagged PDF document and dumps
11// the structure information to the console window.
12//
13// In tagged PDF documents StructTree acts as a central repository for information
14// related to a PDF document's logical structure. The tree consists of StructElement-s
15// and ContentItem-s which are leaf nodes of the structure tree.
16//
17// The sample can be extended to access and extract the marked-content elements such
18// as text and images.
19//---------------------------------------------------------------------------------------
20
21
// Builds the prefix that starts every dumped line: a newline followed by
// `ident` copies of the one-character indent unit. A zero or negative
// `ident` yields just the newline.
NSString *PrintIdent(int ident) {
    NSMutableString *prefix = [NSMutableString stringWithString: @"\n"];
    for (int remaining = ident; remaining > 0; --remaining) {
        [prefix appendString: @" "];
    }
    return [prefix copy];
}
30
31// Used in code snippet 1.
// Renders `element` and, recursively, everything below it as indented text.
//
// element - structure element to dump; an invalid element produces @"".
// ident   - indentation depth of this element's own line; its kids are
//           rendered one level deeper.
//
// Returns the accumulated text. Every line starts with the newline/indent
// prefix produced by PrintIdent().
NSString* ProcessStructElement(PTSElement *element, int ident)
{
    if (![element IsValid]) {
        return @"";
    }

    NSMutableString *out = [NSMutableString string];

    // Header line: the element type plus the title, when one is present.
    [out appendFormat: @"%@Type: %@", PrintIdent(ident), [element GetType]];
    if ([element HasTitle]) {
        [out appendFormat: @". Title: %@", [element GetTitle]];
    }

    const int kidIdent = ident + 1;
    const int kidCount = [element GetNumKids];
    for (int k = 0; k < kidCount; ++k) {
        // A kid that is not a ContentItem is a nested StructElement: recurse.
        if (![element IsContentItem: k]) {
            [out appendString: ProcessStructElement([element GetAsStructElem: k], kidIdent)];
            continue;
        }

        // Check if the kid is a leaf node (i.e. it is a ContentItem).
        PTContentItem *item = [element GetAsContentItem: k];
        PTContentItemType itemType = [item GetType];
        PTPage *page = [item GetPage];

        [out appendFormat: @"%@Content Item. Part of page #%d%@",
                           PrintIdent(kidIdent), [page GetIndex], PrintIdent(kidIdent)];

        if (itemType == e_ptMCID || itemType == e_ptMCR) {
            [out appendFormat: @"MCID: %d", [item GetMCID]];
        }
        else if (itemType == e_ptOBJR) {
            [out appendString: @"OBJR "];
            PTObj *referenced = [item GetRefObj];
            if (referenced != NULL) {
                [out appendFormat: @"- Referenced Object#: %u", [referenced GetObjNum]];
            }
        }
        // Any other content item type contributes nothing beyond the page line.
    }
    return [out copy];
}
81
82// Used in code snippet 2.
// Reads every element from `reader` and describes the paths, text runs and
// form XObjects it finds, together with the structure element (if any) each
// one belongs to.
//
// reader - element reader already positioned on a page (the caller calls
//          Begin: before and End after this function).
//
// Returns the accumulated description; @"" when no element was processed.
//
// The previous filter tested e_ptpath twice, so the e_ptform case could never
// run. Dispatching on the type directly removes the duplicated condition and
// makes form XObjects reachable.
NSString* ProcessLogicalStructureTestElements(PTElementReader *reader)
{
    NSMutableString *result = [NSMutableString string];
    PTElement *element;
    while ((element = [reader Next]) != NULL) // Read page contents
    {
        // In this sample we process only paths, text and form XObjects, but
        // the code can be extended to handle any element type.
        switch ([element GetType]) {
            case e_ptpath: // Process path ...
                [result appendString: @"\nPATH: "];
                break;
            case e_pttext_obj: // Process text ...
                [result appendFormat: @"\nTEXT: %@\n", [element GetTextString]];
                break;
            case e_ptform: // Process form XObjects
                [result appendString: @"\nFORM XObject: "];
                //reader.FormBegin();
                //ProcessLogicalStructureTestElements(reader);
                //reader.End();
                break;
            default:
                // Not an element kind this sample reports on: move to the next one.
                continue;
        }

        // Check if the element is associated with any structural element.
        // Content items are leaf nodes of the structure tree.
        PTSElement *struct_parent = [element GetParentStructElement];
        if ([struct_parent IsValid]) {
            // Print out the parent structural element's type, title, and object number.
            [result appendFormat: @" Type: %@, MCID: %d", [struct_parent GetType], [element GetStructMCID]];
            if ([struct_parent HasTitle]) {
                [result appendFormat: @". Title: %@", [struct_parent GetTitle]];
            }
            [result appendFormat: @", Obj#: %u", [[struct_parent GetSDFObj] GetObjNum]];
        }
    }
    return [result copy];
}
126
127// Used in code snippet 3.
// File-scope dictionaries named after the map types used by snippet 3:
//   MCIDPageMap - MCID (number) -> text collected for that MCID on one page
//   MCIDDocMap  - page number   -> the MCIDPageMap built for that page
// The functions in this file receive their dictionaries as parameters and do
// not read or write these two variables.
NSMutableDictionary *MCIDPageMap = nil;
NSMutableDictionary *MCIDDocMap = nil;
132
133// Used in code snippet 3.
// Collects the text of every marked-content text element read from `reader`,
// grouped by MCID.
//
// reader        - element reader already positioned on a page.
// mcid_page_map - in/out dictionary keyed by MCID (NSNumber). The text of each
//                 matching element is appended to the string stored under its
//                 MCID, creating the entry on first use.
//
// Changes from the earlier version:
//  * the entry is fetched with a direct dictionary lookup instead of scanning
//    `allKeys`, which was a linear search per element;
//  * a nil text string is treated as empty, so appending cannot raise on a
//    nil argument.
void ProcessLogicalStructureTestElements2(PTElementReader *reader, NSMutableDictionary *mcid_page_map)
{
    PTElement *element;
    while ((element = [reader Next]) != NULL) // Read page contents
    {
        // In this sample we process only text, but the code can be extended
        // to handle paths, images, or any other Element type.
        int mcid = [element GetStructMCID];
        if (mcid < 0 || [element GetType] != e_pttext_obj) {
            continue;
        }

        NSString *text = [element GetTextString] ?: @"";
        NSNumber *key = @(mcid);
        NSString *collected = mcid_page_map[key];
        mcid_page_map[key] = (collected != nil) ? [collected stringByAppendingString: text] : text;
    }
}
156
157// Used in code snippet 3.
// Renders `element` and its descendants as XML-style text, filling each
// element with the page text previously collected for its MCID content items.
//
// element      - structure element to render; an invalid element produces @"".
// mcid_doc_map - dictionary keyed by page number (NSNumber) whose values are
//                per-page dictionaries mapping MCID (NSNumber) to text.
// ident        - indentation depth of this element; kids are rendered at
//                ident + 1.
//
// Returns "<Type [title="..."]>", then the content, then "</Type>", with the
// opening and closing tags each prefixed by PrintIdent(ident).
//
// Both map lookups now use direct keyed access. The earlier
// `allKeys containsObject:` checks scanned every key for each content item.
NSString* ProcessStructElement2(PTSElement *element, NSMutableDictionary *mcid_doc_map, int ident)
{
    if (![element IsValid]) {
        return @"";
    }

    NSMutableString *result = [NSMutableString string];

    // Opening tag: the element type plus the title attribute, if any.
    [result appendString: PrintIdent(ident)];
    [result appendFormat: @"<%@", [element GetType]];
    if ([element HasTitle]) {
        [result appendFormat: @" title=\"%@\"", [element GetTitle]];
    }
    [result appendString: @">"];

    int num = [element GetNumKids];
    for (int i = 0; i < num; ++i)
    {
        if ([element IsContentItem: i]) {
            PTContentItem *cont = [element GetAsContentItem: i];
            // Only MCID content items carry page text in this sample.
            if ([cont GetType] == e_ptMCID) {
                int page_num = [[cont GetPage] GetIndex];
                NSMutableDictionary *mcid_page_map = mcid_doc_map[@(page_num)];
                if (mcid_page_map != nil) {
                    NSString *text = mcid_page_map[@([cont GetMCID])];
                    if (text != nil) {
                        [result appendString: text];
                    }
                }
            }
        }
        else { // the kid is another StructElement node.
            [result appendString: ProcessStructElement2([element GetAsStructElem: i], mcid_doc_map, ident + 1)];
        }
    }

    // Closing tag on its own indented line.
    [result appendString: PrintIdent(ident)];
    [result appendFormat: @"</%@>", [element GetType]];
    return [result copy];
}
203
204
// Entry point. Opens the tagged sample PDF and runs the three snippets:
//   1. dump the structure tree,
//   2. list page elements with their parent structure element,
//   3. print an XML-style view combining structure and page text.
// The document is then saved as a linearized PDF. Returns 0 on success and 1
// if an NSException was caught.
int main(int argc, char *argv[])
{
    @autoreleasepool {
        int exitCode = 0;
        NSString *rule = @"____________________________________________________________";
        [PTPDFNet Initialize: 0];

        @try // Extract logical structure from a PDF document
        {
            PTPDFDoc *doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/tagged.pdf"];
            [doc InitSecurityHandler];

            NSLog(@"%@", rule);
            NSLog(@"Sample 1 - Traverse logical structure tree...");
            {
                PTSTree *tree = [doc GetStructTree];
                if (![tree IsValid]) {
                    NSLog(@"This document does not contain any logical structure.");
                }
                else {
                    NSLog(@"Document has a StructTree root.");

                    // Recursively get structure info for all child elements.
                    int kid = 0;
                    while (kid < [tree GetNumKids]) {
                        NSLog(@"%@", ProcessStructElement([tree GetKid: kid], 0));
                        ++kid;
                    }
                }
            }
            NSLog(@"Done 1.");

            NSLog(@"%@", rule);
            NSLog(@"Sample 2 - Get parent logical structure elements from");
            NSLog(@"layout elements.");
            {
                PTElementReader *reader = [[PTElementReader alloc] init];
                PTPageIterator *pages = [doc GetPageIterator: 1];
                while ([pages HasNext]) {
                    [reader Begin: [pages Current]];
                    NSLog(@"%@", ProcessLogicalStructureTestElements(reader));
                    [reader End];
                    [pages Next];
                }
            }
            NSLog(@"Done 2.");

            NSLog(@"%@", rule);
            NSLog(@"Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
            {
                // First pass: page number -> (MCID -> text) for every page.
                NSMutableDictionary *mcid_doc_map = [[NSMutableDictionary alloc] init];
                PTElementReader *reader = [[PTElementReader alloc] init];
                PTPageIterator *pages = [doc GetPageIterator: 1];
                while ([pages HasNext]) {
                    [reader Begin: [pages Current]];
                    NSMutableDictionary *pageMap = [[NSMutableDictionary alloc] init];
                    id pageKey = @([[pages Current] GetIndex]);
                    mcid_doc_map[pageKey] = pageMap;
                    ProcessLogicalStructureTestElements2(reader, pageMap);
                    [reader End];
                    [pages Next];
                }

                // Second pass: walk the structure tree and splice the text in.
                PTSTree *tree = [doc GetStructTree];
                if ([tree IsValid]) {
                    int kid = 0;
                    while (kid < [tree GetNumKids]) {
                        NSLog(@"%@", ProcessStructElement2([tree GetKid: kid], mcid_doc_map, 0));
                        ++kid;
                    }
                }
            }
            NSLog(@"Done 3.");
            [doc SaveToFile: @"../../TestFiles/Output/LogicalStructure.pdf" flags: e_ptlinearized];
        }
        @catch(NSException *e)
        {
            NSLog(@"%@", e.reason);
            exitCode = 1;
        }
        [PTPDFNet Terminate: 0];
        return exitCode;
    }
}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import PDFNet
7import Foundation
8
9//---------------------------------------------------------------------------------------
10// This sample explores the structure and content of a tagged PDF document and dumps
11// the structure information to the console window.
12//
13// In tagged PDF documents StructTree acts as a central repository for information
14// related to a PDF document's logical structure. The tree consists of StructElement-s
15// and ContentItem-s which are leaf nodes of the structure tree.
16//
17// The sample can be extended to access and extract the marked-content elements such
18// as text and images.
19//---------------------------------------------------------------------------------------
20
// Returns the prefix that starts every dumped line: a newline followed by
// `indent` copies of the one-character indent unit.
func PrintIndent(_ indent: Int) -> String {
    return "\n" + String(repeating: " ", count: indent)
}
28
29// Used in code snippet 1.
// Renders `element` and, recursively, everything below it as indented text.
//
// element - structure element to dump; an invalid element produces "".
// indent  - indentation depth of this element's own line; its kids are
//           rendered one level deeper.
//
// Returns the accumulated text; every line starts with the prefix produced by
// PrintIndent().
func ProcessStructElement(element: PTSElement, indent: Int) -> String {
    guard element.isValid() else {
        return ""
    }

    // Fragments are collected in order and concatenated once at the end.
    var pieces: [String] = []

    // Header line: the element type plus the title, when one is present.
    pieces.append("\(PrintIndent(indent))Type: \(String(describing: element.getType()))")
    let childIndent = indent + 1
    if element.hasTitle() {
        pieces.append(". Title: \(String(describing: element.getTitle()))")
    }

    let kidCount = element.getNumKids()
    for kid in 0..<kidCount {
        // A kid that is not a ContentItem is a nested StructElement: recurse.
        guard element.isContentItem(kid) else {
            pieces.append(ProcessStructElement(element: element.getAsStructElem(kid), indent: childIndent))
            continue
        }

        // The kid is a leaf node (i.e. it is a ContentItem).
        let item: PTContentItem = element.getAsContentItem(kid)
        let itemType: PTContentItemType = item.getType()
        let page: PTPage = item.getPage()
        pieces.append("\(PrintIndent(childIndent))Content Item. Part of page #\(page.getIndex())\(PrintIndent(childIndent))")

        if itemType == e_ptMCID || itemType == e_ptMCR {
            pieces.append("MCID: \(item.getMCID())")
        } else if itemType == e_ptOBJR {
            pieces.append("OBJR ")
            if let ref_obj = item.getRefObj() {
                pieces.append("- Referenced Object#: \(ref_obj.getNum())")
            }
        }
        // Any other content item type contributes nothing beyond the page line.
    }
    return pieces.joined()
}
70
71// Used in code snippet 2.
// Reads every element from `reader` and describes the paths, text runs and
// form XObjects it finds, together with the structure element (if any) each
// one belongs to.
//
// reader - element reader already positioned on a page (the caller calls
//          begin() before and end() after this function).
//
// Returns the accumulated description; "" when no element was processed.
//
// The previous filter tested e_ptpath twice, so the e_ptform case could never
// run. Dispatching on the type directly removes the duplicated condition and
// makes form XObjects reachable.
func ProcessLogicalStructureTestElements(reader: PTElementReader) -> String {
    var result = ""
    while let element = reader.next() { // Read page contents
        // In this sample we process only paths, text and form XObjects, but
        // the code can be extended to handle any element type.
        let type: PTElementType = element.getType()
        switch type {
        case e_ptpath: // Process path ...
            result += "\nPATH: "
        case e_pttext_obj: // Process text ...
            result += "\nTEXT: \(String(describing: element.getTextString()))"
        case e_ptform: // Process form XObjects
            result += "\nFORM XObject:"
            //reader.FormBegin();
            //ProcessLogicalStructureTestElements(reader);
            //reader.End();
        default:
            // Not an element kind this sample reports on: move to the next one.
            continue
        }

        // Check if the element is associated with any structural element.
        // Content items are leaf nodes of the structure tree.
        let struct_parent: PTSElement = element.getParentStructElement()
        if struct_parent.isValid() {
            // Print out the parent structural element's type, title, and object number.
            result += " Type: \(String(describing: struct_parent.getType())), MCID: \(element.getStructMCID())"
            if struct_parent.hasTitle() {
                result += ". Title: \(String(describing: struct_parent.getTitle()))"
            }
            result += ", Obj#: \(struct_parent.getSDFObj().getNum())"
        }
    }
    return result
}
108
109
110// Used in code snippet 3.
111//typedef map<int, string> MCIDPageMap;
112//var MCIDPageMap = [AnyHashable: Any]()
113//var MCIDDocMap = [AnyHashable: Any]()
114//typedef map<int, MCIDPageMap> MCIDDocMap;
115
116// Used in code snippet 3.
// Collects the text of every marked-content text element read from `reader`,
// grouped by MCID.
//
// reader        - element reader already positioned on a page.
// mcid_page_map - in/out dictionary keyed by MCID. The text of each matching
//                 element is appended to the string stored under its MCID; a
//                 missing or non-string entry is treated as empty, and a nil
//                 text string contributes "".
func ProcessLogicalStructureTestElements2(reader: PTElementReader, mcid_page_map: NSMutableDictionary) {
    while let element = reader.next() { // Read page contents
        // In this sample we process only text, but the code can be extended
        // to handle paths, images, or any other Element type.
        let mcid = element.getStructMCID()
        guard mcid >= 0, element.getType() == e_pttext_obj else {
            continue
        }

        let text = element.getTextString()
        let collected = (mcid_page_map[mcid] as? String) ?? ""
        mcid_page_map[mcid] = collected + (text ?? "")
    }
}
133
134// Used in code snippet 3.
// Renders `element` and its descendants as XML-style text, filling each
// element with the page text previously collected for its MCID content items.
//
// element      - structure element to render; an invalid element produces "".
// mcid_doc_map - dictionary keyed by page number whose values are per-page
//                dictionaries mapping MCID to text.
// indent       - indentation depth of this element; kids are rendered at
//                indent + 1.
//
// Returns the opening tag, the content, and the closing tag, with both tags
// prefixed by PrintIndent(indent).
func ProcessStructElement2(element: PTSElement, mcid_doc_map: NSMutableDictionary, indent: Int) -> String {
    guard element.isValid() else {
        return ""
    }

    // Opening tag: the element type plus the title attribute, if any.
    var xml = PrintIndent(indent)
    xml += "<\(String(describing: element.getType()))"
    if element.hasTitle() {
        xml += " title=\"\(String(describing: element.getTitle()))\""
    }
    xml += ">"

    let kidCount = element.getNumKids()
    for kid in 0..<kidCount {
        // A kid that is not a ContentItem is a nested StructElement: recurse.
        guard element.isContentItem(kid) else {
            xml += ProcessStructElement2(element: element.getAsStructElem(kid), mcid_doc_map: mcid_doc_map, indent: indent + 1)
            continue
        }

        // Only MCID content items carry page text; the lookups run left to
        // right and stop at the first clause that fails.
        let item: PTContentItem = element.getAsContentItem(kid)
        if item.getType() == e_ptMCID,
           let pageMap = mcid_doc_map[item.getPage().getIndex()] as? NSMutableDictionary,
           let text = pageMap[item.getMCID()] as? String {
            xml += text
        }
    }

    // Closing tag on its own indented line.
    xml += PrintIndent(indent)
    xml += "</\(String(describing: element.getType()))>"
    return xml
}
172
// Runs the three logical-structure snippets against the bundled "tagged.pdf":
//   1. dump the structure tree,
//   2. list page elements with their parent structure element,
//   3. print an XML-style view combining structure and page text.
// Returns 0 on success and 1 if the PDFNet call block raised an error.
func runLogicalStructureTest() -> Int {
    return autoreleasepool {
        var status: Int = 0
        let rule = "____________________________________________________________"

        do {
            // Extract logical structure from a PDF document
            try PTPDFNet.catchException {
                let doc: PTPDFDoc = PTPDFDoc(filepath: Bundle.main.path(forResource: "tagged", ofType: "pdf"))
                doc.initSecurityHandler()

                print(rule)
                print("Sample 1 - Traverse logical structure tree...")
                do {
                    let tree: PTSTree = doc.getStructTree()
                    if !tree.isValid() {
                        print("This document does not contain any logical structure.")
                    }
                    else {
                        print("Document has a StructTree root.")

                        // Recursively get structure info for all child elements.
                        for kid in 0..<tree.getNumKids() {
                            print(ProcessStructElement(element: tree.getKid(kid), indent: 0))
                        }
                    }
                }
                print("Done 1.")

                print(rule)
                print("Sample 2 - Get parent logical structure elements from")
                print("layout elements.")
                do {
                    let reader: PTElementReader = PTElementReader()
                    let pages: PTPageIterator = doc.getPageIterator(1)
                    while pages.hasNext() {
                        reader.begin(pages.current())
                        print(ProcessLogicalStructureTestElements(reader: reader))
                        reader.end()
                        pages.next()
                    }
                }
                print("Done 2.")

                print(rule)
                print("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
                do {
                    // First pass: page number -> (MCID -> text) for every page.
                    let mcid_doc_map = NSMutableDictionary()
                    let reader: PTElementReader = PTElementReader()
                    let pages: PTPageIterator = doc.getPageIterator(1)
                    while pages.hasNext() {
                        reader.begin(pages.current())
                        let pageMap = NSMutableDictionary()
                        ProcessLogicalStructureTestElements2(reader: reader, mcid_page_map: pageMap)
                        let pageKey = pages.current().getIndex()
                        mcid_doc_map[pageKey] = pageMap
                        reader.end()
                        pages.next()
                    }

                    // Second pass: walk the structure tree and splice the text in.
                    let tree: PTSTree = doc.getStructTree()
                    if tree.isValid() {
                        for kid in 0..<tree.getNumKids() {
                            print(ProcessStructElement2(element: tree.getKid(kid), mcid_doc_map: mcid_doc_map, indent: 0))
                        }
                    }
                }
                print("Done 3.")
            }
        } catch let e as NSError {
            print("Caught PDFNet exception: \(e)")
            status = 1
        }
        return status
    }
}
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales