LogicalStructure

Sample Obj-C code that uses the Apryse SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s, the latter being the leaf nodes of the structure tree. Learn more about our iOS SDK and PDF Data Extraction SDK Capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#import <OBJC/PDFNetOBJC.h>
7#import <Foundation/Foundation.h>
8
9//---------------------------------------------------------------------------------------
10// This sample explores the structure and content of a tagged PDF document and dumps 
11// the structure information to the console window.
12//
13// In tagged PDF documents StructTree acts as a central repository for information 
14// related to a PDF document's logical structure. The tree consists of StructElement-s
15// and ContentItem-s which are leaf nodes of the structure tree.
16//
17// The sample can be extended to access and extract the marked-content elements such 
18// as text and images.
19//---------------------------------------------------------------------------------------
20
21
// Returns a newline followed by two spaces for every indentation level.
// A zero or negative `ident` yields just the newline.
NSString *PrintIdent(int ident) {
    // Two pad characters per level; clamp so a negative level adds nothing.
    NSUInteger padding = (ident > 0) ? (NSUInteger)ident * 2 : 0;
    return [@"\n" stringByPaddingToLength: padding + 1
                               withString: @" "
                          startingAtIndex: 0];
}
30
31// Used in code snippet 1.
// Builds a text dump of `element` and, recursively, of every kid below it.
//
// element: the structure element to describe; an invalid element produces @"".
// ident:   indentation level of the element's own line; its kids are written
//          one level deeper.
// Returns the accumulated dump, each entry starting on a new indented line.
NSString* ProcessStructElement(PTSElement *element, int ident)
{
    if (![element IsValid]) {
        return @"";
    }

    NSMutableString *dump = [NSMutableString string];

    // Header line: the element's type and, when it has one, its title.
    [dump appendFormat: @"%@Type: %@", PrintIdent(ident), [element GetType]];
    if ([element HasTitle]) {
        [dump appendFormat: @". Title: %@", [element GetTitle]];
    }

    const int kidIdent = ident + 1;
    const int kidCount = [element GetNumKids];
    for (int k = 0; k < kidCount; ++k)
    {
        // A kid that is not a ContentItem is another StructElement: recurse.
        if (![element IsContentItem: k]) {
            [dump appendString: ProcessStructElement([element GetAsStructElem: k], kidIdent)];
            continue;
        }

        // The kid is a leaf node (a ContentItem).
        PTContentItem *item = [element GetAsContentItem: k];
        PTContentItemType itemType = [item GetType];
        PTPage *page = [item GetPage];

        [dump appendFormat: @"%@Content Item. Part of page #%d%@", PrintIdent(kidIdent), [page GetIndex], PrintIdent(kidIdent)];

        switch (itemType) {
            case e_ptMCID:
            case e_ptMCR:
                [dump appendFormat: @"MCID: %d", [item GetMCID]];
                break;
            case e_ptOBJR:
                {
                    [dump appendString: @"OBJR "];
                    PTObj *refObj = [item GetRefObj];
                    if (refObj != nil) {
                        [dump appendFormat: @"- Referenced Object#: %u", [refObj GetObjNum]];
                    }
                }
                break;
            default:
                break;
        }
    }
    return dump;
}
81
82// Used in code snippet 2.
// Reads every remaining element from `reader` and describes the paths, text
// objects and form XObjects it finds, together with the structure element
// (if any) each one belongs to.
//
// reader: an element reader on which Begin has already been called; it is
//         consumed until [reader Next] returns nil.
// Returns the accumulated description (empty when nothing matched).
NSString* ProcessLogicalStructureTestElements(PTElementReader *reader)
{
    NSMutableString *result = [NSMutableString string];
    PTElement *element;
    while ((element = [reader Next]) != nil)     // Read page contents
    {
        // In this sample we process only paths, text and form XObjects, but
        // the code can be extended to handle any element type.
        PTElementType type = [element GetType];
        // The filter must list e_ptform, otherwise the form case below can
        // never be reached.
        if (type != e_ptpath && type != e_pttext_obj && type != e_ptform) {
            continue;
        }

        switch (type) {
            case e_ptpath:                // Process path ...
                [result appendString: @"\nPATH: "];
                break;
            case e_pttext_obj:            // Process text ...
                [result appendFormat: @"\nTEXT: %@\n", [element GetTextString]];
                break;
            case e_ptform:                // Process form XObjects
                [result appendString: @"\nFORM XObject: "];
                //reader.FormBegin();
                //ProcessLogicalStructureTestElements(reader);
                //reader.End();
                break;
            default:
                break;
        }

        // Check if the element is associated with any structural element.
        // Content items are leaf nodes of the structure tree.
        PTSElement *struct_parent = [element GetParentStructElement];
        if ([struct_parent IsValid]) {
            // Print out the parent structural element's type, the element's
            // MCID, the parent's title (if any), and its object number.
            [result appendFormat: @" Type: %@, MCID: %d", [struct_parent GetType], [element GetStructMCID]];
            if ([struct_parent HasTitle]) {
                [result appendFormat: @". Title: %@", [struct_parent GetTitle]];
            }
            [result appendFormat: @", Obj#: %u", [[struct_parent GetSDFObj] GetObjNum]];
        }
    }
    return result;
}
126
127// Used in code snippet 3.
// Dictionary-based counterparts of the C++ typedefs
//   typedef map<int, string>      MCIDPageMap;
//   typedef map<int, MCIDPageMap> MCIDDocMap;
// NOTE(review): neither global is referenced by the code visible in this
// file; the snippet-3 functions receive their maps as parameters.
NSMutableDictionary *MCIDPageMap = nil;
NSMutableDictionary *MCIDDocMap = nil;
132
133// Used in code snippet 3.
// Collects the text of every tagged text element the reader yields into
// `mcid_page_map`, keyed by the element's MCID (boxed as NSNumber). Text from
// several elements sharing one MCID is concatenated in reading order.
//
// reader:        an element reader on which Begin has already been called; it
//                is consumed until [reader Next] returns nil.
// mcid_page_map: dictionary updated in place (NSNumber MCID -> NSString text).
void ProcessLogicalStructureTestElements2(PTElementReader *reader, NSMutableDictionary *mcid_page_map)
{
    PTElement *element;
    while ((element = [reader Next]) != nil) // Read page contents
    {
        // In this sample we process only text, but the code can be extended
        // to handle paths, images, or any other Element type.
        int mcid = [element GetStructMCID];
        if (mcid < 0 || [element GetType] != e_pttext_obj) {
            continue;
        }

        // A nil string would make stringByAppendingString: throw; treat it
        // as empty text instead.
        NSString *text = [element GetTextString] ?: @"";
        NSNumber *key = @(mcid);

        // Look the key up directly rather than scanning allKeys, which costs
        // a linear pass over the dictionary for every element.
        NSString *existing = mcid_page_map[key];
        mcid_page_map[key] = existing ? [existing stringByAppendingString: text] : text;
    }
}
156
157// Used in code snippet 3.
// Produces an "XML style" rendering of `element` and its descendants: an
// opening tag named after the element's type (with an optional title
// attribute), the text collected for its MCID content items, its nested
// structure elements, and a closing tag.
//
// element:      the structure element to render; an invalid element gives @"".
// mcid_doc_map: page index (NSNumber) -> dictionary of MCID (NSNumber) -> text,
//               as filled by ProcessLogicalStructureTestElements2.
// ident:        indentation level of this element's tags; kids use ident+1.
// Returns the rendered text.
NSString* ProcessStructElement2(PTSElement *element, NSMutableDictionary *mcid_doc_map, int ident)
{
    if (![element IsValid]) {
        return @"";
    }

    NSMutableString *result = [NSMutableString string];

    // Opening tag with the type and title info, if any.
    [result appendString: PrintIdent(ident)];
    [result appendFormat: @"<%@", [element GetType]];
    if ([element HasTitle]) {
        [result appendFormat: @" title=\"%@\"", [element GetTitle]];
    }
    [result appendString: @">"];

    int num = [element GetNumKids];
    for (int i = 0; i < num; ++i)
    {
        if (![element IsContentItem: i]) {
            // The kid is another StructElement node.
            [result appendString: ProcessStructElement2([element GetAsStructElem: i], mcid_doc_map, ident + 1)];
            continue;
        }

        PTContentItem *cont = [element GetAsContentItem: i];
        if ([cont GetType] != e_ptMCID) {
            continue;
        }

        // Direct keyed lookups: a missing key simply yields nil, so there is
        // no need to scan allKeys before reading the value.
        int page_num = [[cont GetPage] GetIndex];
        NSDictionary *mcid_page_map = mcid_doc_map[@(page_num)];
        if (mcid_page_map == nil) {
            continue;
        }
        NSString *text = mcid_page_map[@([cont GetMCID])];
        if (text != nil) {
            [result appendString: text];
        }
    }

    // Closing tag.
    [result appendString: PrintIdent(ident)];
    [result appendFormat: @"</%@>", [element GetType]];
    return result;
}
203
204
// Entry point: opens the tagged test PDF and runs the three samples in turn,
// logging their output. Returns 0 on success, or 1 when an NSException was
// caught while processing the document.
int main(int argc, char *argv[])
{
    @autoreleasepool {
        int status = 0;
        [PTPDFNet Initialize: 0];

        @try    // Extract logical structure from a PDF document
        {
            PTPDFDoc *doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/tagged.pdf"];
            [doc InitSecurityHandler];

            NSLog(@"____________________________________________________________");
            NSLog(@"Sample 1 - Traverse logical structure tree...");
            {
                PTSTree *structTree = [doc GetStructTree];
                if (![structTree IsValid]) {
                    NSLog(@"This document does not contain any logical structure.");
                }
                else {
                    NSLog(@"Document has a StructTree root.");

                    // Recursively get structure info for all child elements.
                    for (int kid = 0; kid < [structTree GetNumKids]; ++kid) {
                        NSLog(@"%@", ProcessStructElement([structTree GetKid: kid], 0));
                    }
                }
            }
            NSLog(@"Done 1.");

            NSLog(@"____________________________________________________________");
            NSLog(@"Sample 2 - Get parent logical structure elements from");
            NSLog(@"layout elements.");
            {
                PTElementReader *pageReader = [[PTElementReader alloc] init];
                PTPageIterator *pages = [doc GetPageIterator: 1];
                while ([pages HasNext]) {
                    [pageReader Begin: [pages Current]];
                    NSLog(@"%@", ProcessLogicalStructureTestElements(pageReader));
                    [pageReader End];
                    [pages Next];
                }
            }
            NSLog(@"Done 2.");

            NSLog(@"____________________________________________________________");
            NSLog(@"Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
            {
                // First pass: gather the tagged text of every page, keyed by
                // page index and then by MCID.
                NSMutableDictionary *docMap = [[NSMutableDictionary alloc] init];
                PTElementReader *pageReader = [[PTElementReader alloc] init];
                PTPageIterator *pages = [doc GetPageIterator: 1];
                while ([pages HasNext]) {
                    [pageReader Begin: [pages Current]];
                    NSMutableDictionary *pageMap = [[NSMutableDictionary alloc] init];
                    id pageKey = @([[pages Current] GetIndex]);
                    docMap[pageKey] = pageMap;
                    ProcessLogicalStructureTestElements2(pageReader, pageMap);
                    [pageReader End];
                    [pages Next];
                }

                // Second pass: render the structure tree using that text.
                PTSTree *structTree = [doc GetStructTree];
                if ([structTree IsValid]) {
                    for (int kid = 0; kid < [structTree GetNumKids]; ++kid) {
                        NSLog(@"%@", ProcessStructElement2([structTree GetKid: kid], docMap, 0));
                    }
                }
            }
            NSLog(@"Done 3.");
            [doc SaveToFile: @"../../TestFiles/Output/LogicalStructure.pdf" flags: e_ptlinearized];
        }
        @catch(NSException *e)
        {
            NSLog(@"%@", e.reason);
            status = 1;
        }
        [PTPDFNet Terminate: 0];
        return status;
    }
}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import PDFNet
7import Foundation
8
9//---------------------------------------------------------------------------------------
10// This sample explores the structure and content of a tagged PDF document and dumps
11// the structure information to the console window.
12//
13// In tagged PDF documents StructTree acts as a central repository for information
14// related to a PDF document's logical structure. The tree consists of StructElement-s
15// and ContentItem-s which are leaf nodes of the structure tree.
16//
17// The sample can be extended to access and extract the marked-content elements such
18// as text and images.
19//---------------------------------------------------------------------------------------
20
/// Returns a newline followed by two spaces for every indentation level.
/// - Parameter indent: the indentation level; must not be negative.
func PrintIndent(_ indent: Int) -> String {
    return "\n" + String(repeating: "  ", count: indent)
}
28
29// Used in code snippet 1.
/// Builds a text dump of `element` and, recursively, of every kid below it.
/// - Parameters:
///   - element: the structure element to describe; an invalid element gives "".
///   - indent: indentation level of the element's own line; kids are written
///     one level deeper.
/// - Returns: the accumulated dump, each entry starting on a new indented line.
func ProcessStructElement(element: PTSElement, indent: Int) -> String {
    guard element.isValid() else {
        return ""
    }

    let childIndent = indent + 1

    // Header line: the element's type and, when it has one, its title.
    var dump = "\(PrintIndent(indent))Type: \(String(describing: element.getType()))"
    if element.hasTitle() {
        dump += ". Title: \(String(describing: element.getTitle()))"
    }

    let kidCount = element.getNumKids()
    for index in 0..<kidCount {
        // A kid that is not a ContentItem is another StructElement: recurse.
        guard element.isContentItem(index) else {
            dump += ProcessStructElement(element: element.getAsStructElem(index), indent: childIndent)
            continue
        }

        // The kid is a leaf node (a ContentItem).
        let item: PTContentItem = element.getAsContentItem(index)
        let itemType: PTContentItemType = item.getType()
        let page: PTPage = item.getPage()

        dump += "\(PrintIndent(childIndent))Content Item. Part of page #\(page.getIndex())\(PrintIndent(childIndent))"
        switch itemType {
        case e_ptMCID, e_ptMCR:
            dump += "MCID: \(item.getMCID())"
        case e_ptOBJR:
            dump += "OBJR "
            if let refObj = item.getRefObj() {
                dump += "- Referenced Object#: \(refObj.getNum())"
            }
        default:
            break
        }
    }
    return dump
}
70
71// Used in code snippet 2.
/// Reads every remaining element from `reader` and describes the paths, text
/// objects and form XObjects it finds, together with the structure element
/// (if any) each one belongs to.
/// - Parameter reader: an element reader on which `begin` has already been
///   called; it is consumed until `next()` returns nil.
/// - Returns: the accumulated description (empty when nothing matched).
func ProcessLogicalStructureTestElements(reader: PTElementReader) -> String {
    var result = ""
    while let element = reader.next() { // Read page contents
        // In this sample we process only paths, text and form XObjects, but
        // the code can be extended to handle any element type.
        let type: PTElementType = element.getType()
        // The filter must list e_ptform, otherwise the form case below can
        // never be reached.
        guard type == e_ptpath || type == e_pttext_obj || type == e_ptform else {
            continue
        }

        switch type {
        case e_ptpath:  // Process path ...
            result += "\nPATH: "
        case e_pttext_obj:  // Process text ...
            result += "\nTEXT: \(String(describing: element.getTextString()))"
        case e_ptform:  // Process form XObjects
            result += "\nFORM XObject:"
            //reader.FormBegin();
            //ProcessLogicalStructureTestElements(reader);
            //reader.End();
        default:
            break
        }

        // Check if the element is associated with any structural element.
        // Content items are leaf nodes of the structure tree.
        let struct_parent: PTSElement = element.getParentStructElement()
        if struct_parent.isValid() {
            // Print out the parent structural element's type, the element's
            // MCID, the parent's title (if any), and its object number.
            result += " Type: \(String(describing: struct_parent.getType())), MCID: \(element.getStructMCID())"
            if struct_parent.hasTitle() {
                result += ". Title: \(String(describing: struct_parent.getTitle()))"
            }
            result += ", Obj#: \(struct_parent.getSDFObj().getNum())"
        }
    }
    return result
}
108
109
110// Used in code snippet 3.
111//typedef map<int, string> MCIDPageMap;
112//var MCIDPageMap = [AnyHashable: Any]()
113//var MCIDDocMap = [AnyHashable: Any]()
114//typedef map<int, MCIDPageMap> MCIDDocMap;
115
116// Used in code snippet 3.
/// Collects the text of every tagged text element the reader yields into
/// `mcid_page_map`, keyed by the element's MCID. Text from several elements
/// sharing one MCID is concatenated in reading order.
/// - Parameters:
///   - reader: an element reader on which `begin` has already been called; it
///     is consumed until `next()` returns nil.
///   - mcid_page_map: dictionary updated in place (MCID -> text).
func ProcessLogicalStructureTestElements2(reader: PTElementReader, mcid_page_map: NSMutableDictionary) {
    while let element = reader.next() {    // Read page contents
        // Only text is handled here; paths, images, or any other Element type
        // could be added in the same way.
        let mcid = element.getStructMCID()
        guard mcid >= 0 && element.getType() == e_pttext_obj else {
            continue
        }

        // A missing text string counts as empty text.
        let text = element.getTextString() ?? ""
        let collected = mcid_page_map[mcid] as? String
        mcid_page_map[mcid] = (collected ?? "") + text
    }
}
133
134// Used in code snippet 3.
/// Produces an "XML style" rendering of `element` and its descendants: an
/// opening tag named after the element's type (with an optional title
/// attribute), the text collected for its MCID content items, its nested
/// structure elements, and a closing tag.
/// - Parameters:
///   - element: the structure element to render; an invalid element gives "".
///   - mcid_doc_map: page index -> dictionary of MCID -> text.
///   - indent: indentation level of this element's tags; kids use indent + 1.
/// - Returns: the rendered text.
func ProcessStructElement2(element: PTSElement, mcid_doc_map: NSMutableDictionary, indent: Int) -> String {
    guard element.isValid() else {
        return ""
    }

    // Opening tag with the type and title info, if any.
    var xml = PrintIndent(indent)
    xml += "<\(String(describing: element.getType()))"
    if element.hasTitle() {
        xml += " title=\"\(String(describing: element.getTitle()))\""
    }
    xml += ">"

    let kidCount = element.getNumKids()
    for index in 0..<kidCount {
        guard element.isContentItem(index) else {
            // The kid is another StructElement node.
            xml += ProcessStructElement2(element: element.getAsStructElem(index), mcid_doc_map: mcid_doc_map, indent: indent + 1)
            continue
        }

        let item: PTContentItem = element.getAsContentItem(index)
        guard item.getType() == e_ptMCID else {
            continue
        }

        // Find the page's map first, then the text stored under this MCID.
        let pageNumber = item.getPage().getIndex()
        guard let pageMap = mcid_doc_map[pageNumber] as? NSMutableDictionary else {
            continue
        }
        let contentID = item.getMCID()
        if let text = pageMap[contentID] as? String {
            xml += text
        }
    }

    // Closing tag.
    xml += PrintIndent(indent)
    xml += "</\(String(describing: element.getType()))>"
    return xml
}
172
/// Opens the bundled "tagged.pdf" and runs the three samples in turn, printing
/// their output.
/// - Returns: 0 on success, or 1 when an error was caught while processing the
///   document.
func runLogicalStructureTest() -> Int {
    return autoreleasepool {
        var status: Int = 0

        do {
            // Extract logical structure from a PDF document
            try PTPDFNet.catchException {
                let doc: PTPDFDoc = PTPDFDoc(filepath: Bundle.main.path(forResource: "tagged", ofType: "pdf"))
                doc.initSecurityHandler()

                print("____________________________________________________________")
                print("Sample 1 - Traverse logical structure tree...")
                do {
                    let structTree: PTSTree = doc.getStructTree()
                    if !structTree.isValid() {
                        print("This document does not contain any logical structure.")
                    }
                    else {
                        print("Document has a StructTree root.")

                        // Recursively get structure info for all child elements.
                        for kid in 0..<structTree.getNumKids() {
                            print(ProcessStructElement(element: structTree.getKid(kid), indent: 0))
                        }
                    }
                }
                print("Done 1.")

                print("____________________________________________________________")
                print("Sample 2 - Get parent logical structure elements from")
                print("layout elements.")
                do {
                    let pageReader: PTElementReader = PTElementReader()
                    let pages: PTPageIterator = doc.getPageIterator(1)
                    while pages.hasNext() {
                        pageReader.begin(pages.current())
                        print(ProcessLogicalStructureTestElements(reader: pageReader))
                        pageReader.end()
                        pages.next()
                    }
                }
                print("Done 2.")

                print("____________________________________________________________")
                print("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
                do {
                    // First pass: gather the tagged text of every page, keyed
                    // by page index and then by MCID.
                    let docMap = NSMutableDictionary()
                    let pageReader: PTElementReader = PTElementReader()
                    let pages: PTPageIterator = doc.getPageIterator(1)
                    while pages.hasNext() {
                        pageReader.begin(pages.current())
                        let pageMap = NSMutableDictionary()
                        ProcessLogicalStructureTestElements2(reader: pageReader, mcid_page_map: pageMap)
                        let pageKey = pages.current().getIndex()
                        docMap[pageKey] = pageMap
                        pageReader.end()
                        pages.next()
                    }

                    // Second pass: render the structure tree using that text.
                    let structTree: PTSTree = doc.getStructTree()
                    if structTree.isValid() {
                        for kid in 0..<structTree.getNumKids() {
                            print(ProcessStructElement2(element: structTree.getKid(kid), mcid_doc_map: docMap, indent: 0))
                        }
                    }
                }
                print("Done 3.")
            }
        } catch let error as NSError {
            print("Caught PDFNet exception: \(error)")
            status = 1
        }
        return status
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales

Product:

LogicalStructure