TextExtract

Sample code in Swift and Obj-C for using Apryse iOS SDK to read a PDF (parse and extract text). If you'd like to search text on PDF pages, see our code sample for text search.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with the iOS SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#import <OBJC/PDFNetOBJC.h>
7#import <Foundation/Foundation.h>
8
9// This sample illustrates the basic text extraction capabilities of PDFNet.
10
// Logs every text-related element produced by `reader` to the console.
//
// The reader must already be positioned on a content stream (the callers in
// this file invoke Begin: or FormBegin first). Elements are consumed until
// [reader Next] yields nil. Form XObjects are entered with FormBegin, dumped
// recursively, and closed with End.
void DumpAllText(PTElementReader *reader)
{
    for (PTElement *current = [reader Next]; current != nil; current = [reader Next])
    {
        switch ([current GetType])
        {
        case e_ptform:
            {
                // Descend into the form XObject and dump its contents too.
                [reader FormBegin];
                DumpAllText(reader);
                [reader End];
            }
            break;
        case e_pttext:
            {
                // A text run: report its bounding box, then the text itself.
                PTPDFRect *box = [current GetBBox];
                NSLog(@"--> BBox: %f, %f, %f, %f", [box GetX1], [box GetY1], [box GetX2], [box GetY2]);

                NSString *run_text = [current GetTextString];
                NSLog(@"%@", run_text);
            }
            break;
        case e_pttext_new_line:
            NSLog(@"--> New Line");
            break;
        case e_pttext_begin:
            NSLog(@"--> Text Block Begin");
            break;
        case e_pttext_end:
            NSLog(@"--> Text Block End");
            break;
        default:
            // Every other element type is ignored.
            break;
        }
    }
}
46
// A helper for ReadTextFromRect.
//
// Consumes elements from `reader` until [reader Next] returns nil and, for
// every text element whose bounding box intersects `pos`, appends the
// element's text followed by a newline to *srch_str. Form XObjects are
// entered with FormBegin, searched recursively, and closed with End.
//
// reader:   element reader already positioned on a content stream.
// pos:      selection rectangle tested against each text element's box.
// srch_str: in/out accumulator; must point to a valid NSString variable.
void RectTextSearch(PTElementReader *reader, PTPDFRect * pos, NSString **srch_str)
{
    PTElement *element;
    while ((element = [reader Next]) != nil)
    {
        switch ([element GetType])
        {
        case e_pttext:
            {
                PTPDFRect * bbox = [element GetBBox];
                if ([bbox IntersectRect: bbox rect2: pos])
                {
                    // -stringByAppendingString: raises NSInvalidArgumentException
                    // for a nil argument, so substitute an empty string when the
                    // element has no text (or the accumulator is still nil).
                    NSString *text = [element GetTextString];
                    NSString *so_far = (*srch_str != nil) ? *srch_str : @"";
                    *srch_str = [so_far stringByAppendingFormat: @"%@\n", (text != nil) ? text : @""];
                }
                break;
            }
        case e_pttext_new_line:
            {
                // Line breaks inside a text block are intentionally ignored.
                break;
            }
        case e_ptform: // Process form XObjects
            {
                [reader FormBegin];
                RectTextSearch(reader, pos, srch_str);
                [reader End];
                break;
            }
        default:
            break;
        }
    }
}
82
// Collects the text of `page` that falls inside the selection rectangle
// `pos` (given in the PDF user/page coordinate system).
//
// The supplied `reader` is opened on the page, handed to RectTextSearch to
// gather the matching text, and closed again before the result is returned.
NSString* ReadTextFromRect(PTPage *page, PTPDFRect * pos, PTElementReader *reader)
{
    NSString *collected = @"";

    [reader Begin: page];
    RectTextSearch(reader, pos, &collected);
    [reader End];

    return collected;
}
94
95
// Writes a ` style="..."` XML attribute describing the text style `s` to
// stdout (no trailing newline): font family, font size, an optional
// " sans-serif;" marker, and the color as a #RRGGBB hex triplet.
void PrintStyle(PTTextExtractorStyle *s)
{
    NSArray *rgb = [s GetColor];

    // -UTF8String on a nil name yields NULL, which must not reach "%s".
    const char *font_family = [[s GetFontName] UTF8String];
    if (font_family == NULL)
    {
        font_family = "";
    }

    const double font_size = [s GetFontSize];

    // Only fonts WITHOUT serifs get the " sans-serif;" marker.
    const char *sans_serif = [s IsSerif] ? "" : " sans-serif;";

    // Missing color components default to 0 instead of raising a range exception.
    const int R = (rgb.count > 0) ? [rgb[0] intValue] : 0;
    const int G = (rgb.count > 1) ? [rgb[1] intValue] : 0;
    const int B = (rgb.count > 2) ? [rgb[2] intValue] : 0;

    printf(" style=\"font-family:%s; font-size:%g;%s color:#%02X%02X%02X;\"", font_family, font_size, sans_serif, R, G, B);
}
108
// Entry point of the TextExtract sample.
//
// Initializes PDFNet, runs whichever of the five examples are enabled by the
// boolean switches below against newsletter.pdf, and terminates PDFNet.
// Returns 0 on success and 1 if the page is missing or an exception is caught.
int main(int argc, char *argv[])
{
    @autoreleasepool {
        int ret = 0;
        [PTPDFNet Initialize: 0];

        // Switches selecting which examples run.
        bool example1_basic     = false;
        bool example2_xml       = false;
        bool example3_wordlist  = false;
        bool example4_advanced  = true;
        bool example5_low_level = false;

        // Sample code showing how to use high-level text extraction APIs.
        @try
        {
            PTPDFDoc *doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/newsletter.pdf"];
            [doc InitSecurityHandler];

            PTPage *page = [doc GetPage: 1];
            if (![page IsValid]){
                NSLog(@"Page not found.");
                // Balance the Initialize call above before leaving early.
                [PTPDFNet Terminate: 0];
                return 1;
            }

            PTTextExtractor *txt = [[PTTextExtractor alloc] init];
            [txt Begin: page clip_ptr: 0 flags: 0]; // Read the page.
            // Other options you may want to consider...
            // [txt Begin: page clip_ptr: 0 flags: e_ptno_dup_remove];
            // [txt Begin: page clip_ptr: 0 flags: e_ptremove_hidden_text];


            // Example 1. Get all text on the page in a single string.
            // Words will be separated with space or new line characters.
            if (example1_basic)
            {
                // Get the word count.
                NSLog(@"Word Count: %d", [txt GetWordCount]);

                NSString *text = [txt GetAsText: YES];
                NSLog(@"\n\n- GetAsText --------------------------\n%@", text);
                NSLog(@"-----------------------------------------------------------");
            }

            // Example 2. Get XML logical structure for the page.
            if (example2_xml)
            {
                NSString *text = [txt GetAsXML: e_ptwords_as_elements | e_ptoutput_bbox | e_ptoutput_style_info];
                NSLog(@"\n\n- GetAsXML  --------------------------\n %@", text);
                NSLog(@"-----------------------------------------------------------");
            }

            // Example 3. Extract words one by one.
            if (example3_wordlist)
            {
                PTTextExtractorLine *line = [txt GetFirstLine];
                PTWord *word;
                for (; [line IsValid]; line=[line GetNextLine])    {
                    for (word=[line GetFirstWord]; [word IsValid]; word=[word GetNextWord]) {
                        NSLog(@"%@", [word GetString]);
                    }
                }
                NSLog(@"-----------------------------------------------------------");
            }

            // Example 4. A more advanced text extraction example.
            // The output is XML structure containing paragraphs, lines, words,
            // as well as style and positioning information.
            if (example4_advanced)
            {
                PTPDFRect * b, *q;
                int cur_flow_id=-1, cur_para_id=-1;

                NSString *uni_str;
                PTTextExtractorLine *line;
                PTWord *word;
                PTTextExtractorStyle *s, *line_style;

                printf("<PDFText>\n");
                // For each line on the page...
                for (line=[txt GetFirstLine]; [line IsValid]; line=[line GetNextLine])
                {
                    if ( [line GetNumWords] == 0 )
                    {
                        continue;
                    }

                    // A new flow closes any open paragraph and flow first.
                    if (cur_flow_id != [line GetFlowID]) {
                        if (cur_flow_id != -1) {
                            if (cur_para_id != -1) {
                                cur_para_id = -1;
                                printf("</Para>\n");
                            }
                            printf("</Flow>\n");
                        }
                        cur_flow_id = [line GetFlowID];
                        printf("<Flow id=\"%d\">\n", cur_flow_id);
                    }

                    if (cur_para_id != [line GetParagraphID]) {
                        if (cur_para_id != -1)
                            printf("</Para>\n");
                        cur_para_id = [line GetParagraphID];
                        printf("<Para id=\"%d\">\n", cur_para_id);
                    }

                    b = [line GetBBox];
                    line_style = [line GetStyle];
                    printf("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", [b GetX1], [b GetY1], [b GetX2], [b GetY2]);
                    PrintStyle(line_style);
                    printf(" cur_num=\"%d\"", [line GetCurrentNum]);
                    printf(">\n");

                    // For each word in the line...
                    for (word=[line GetFirstWord]; [word IsValid]; word=[word GetNextWord])
                    {
                        // Skip empty words BEFORE writing anything, so that no
                        // unterminated <Word ...> tag is left in the output.
                        int sz = [word GetStringLen];
                        if (sz == 0) continue;

                        // Output the bounding box for the word.
                        q = [word GetBBox];
                        printf("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", [q GetX1], [q GetY1], [q GetX2], [q GetY2]);
                        printf(" cur_num=\"%d\"", [word GetCurrentNum]);

                        // If the word style is different from the parent style, output the new style.
                        s = [word GetStyle];
                        if(![s isEqualTo:line_style]){
                            PrintStyle(s);
                        }

                        uni_str = [word GetString];
                        printf(">%s", [uni_str UTF8String]);
                        printf("</Word>\n");
                    }
                    printf("</Line>\n");
                }

                // Close whatever paragraph/flow is still open.
                if (cur_flow_id != -1) {
                    if (cur_para_id != -1) {
                        cur_para_id = -1;
                        printf("</Para>\n");
                    }
                    printf("</Flow>\n");
                }
                printf("</PDFText>\n");
            }
        }
        @catch(NSException *e)
        {
            NSLog(@"%@", e.reason);
            ret = 1;
        }

        // Sample code showing how to use the low-level element reader.
        if(example5_low_level)
        {
            @try
            {
                PTPDFDoc *doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/newsletter.pdf"];
                [doc InitSecurityHandler];

                // Example 1. Extract all text content from the document

                PTElementReader *reader = [[PTElementReader alloc] init];
                //  Read every page
                PTPageIterator *itr;
                for (itr=[doc GetPageIterator: 1]; [itr HasNext]; [itr Next])
                {
                    [reader Begin: [itr Current]];
                    DumpAllText(reader);
                    [reader End];
                }

                // Example 2. Extract text content based on the
                // selection rectangle.
                NSLog(@"\n----------------------------------------------------");
                NSLog(@"\nExtract text based on the selection rectangle.");
                NSLog(@"\n----------------------------------------------------\n");

                PTPage *first_page = [doc GetPage: 1];
                PTPDFRect * rect1 = [[PTPDFRect alloc] initWithX1: 27 y1: 392 x2: 563 y2: 534];
                NSString *s1 = ReadTextFromRect(first_page, rect1, reader);
                NSLog(@"\nField 1: %@", s1);

                PTPDFRect * rect2 = [[PTPDFRect alloc] initWithX1: 28 y1: 551 x2: 106 y2: 623];
                s1 = ReadTextFromRect(first_page, rect2, reader);
                NSLog(@"\nField 2: %@", s1);

                PTPDFRect * rect3 = [[PTPDFRect alloc] initWithX1: 208 y1: 550 x2: 387 y2: 621];
                s1 = ReadTextFromRect(first_page, rect3, reader);
                NSLog(@"\nField 3: %@", s1);

                // ...
                NSLog(@"Done.");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }
        }
        [PTPDFNet Terminate: 0];
        return ret;
    }
}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import PDFNet
7import Foundation
8
9// This sample illustrates the basic text extraction capabilities of PDFNet.
10
/// Prints every text-related element produced by `reader` to the console.
///
/// Elements are consumed until `reader.next()` returns nil. Form XObjects are
/// entered with `formBegin()`, dumped recursively, and closed with `end()`.
///
/// - Parameter reader: An element reader already positioned on a content stream.
func DumpAllText(_ reader: PTElementReader) {
    while let element = reader.next() {
        switch element.getType() {
        case e_pttext_begin:
            print("--> Text Block Begin")
        case e_pttext_end:
            print("--> Text Block End")
        case e_pttext_obj:
            let bbox: PTPDFRect = element.getBBox()
            print("--> BBox: \(bbox.getX1()), \(bbox.getY1()), \(bbox.getX2()), \(bbox.getY2())")
            // Fall back to an empty string instead of force-unwrapping, so a
            // text element without a string cannot trap at runtime.
            print("\(element.getTextString() ?? "")")
        case e_pttext_new_line:
            print("--> New Line")
        case e_ptform:
            // Process form XObjects
            reader.formBegin()
            DumpAllText(reader)
            reader.end()
        default:
            break
        }
    }
}
35
/// A helper for `ReadTextFromRect`.
///
/// Drains `reader`, appending the text of every text element whose bounding
/// box intersects `pos` (each followed by a newline) to `srch_str`. Form
/// XObjects are searched recursively.
///
/// - Parameters:
///   - reader: An element reader already positioned on a content stream.
///   - pos: The selection rectangle tested against each text element's box.
///   - srch_str: Accumulator that receives the matching text.
func RectTextSearch(reader: PTElementReader, pos: PTPDFRect, srch_str: inout String) {
    while let item = reader.next() {
        let kind = item.getType()

        if kind == e_pttext_obj {
            let box: PTPDFRect = item.getBBox()
            guard box.intersect(box, rect2: pos) else {
                continue
            }
            srch_str.append(item.getTextString() ?? "")
            srch_str.append("\n")    // add a new line?
        } else if kind == e_ptform {
            // Process form XObjects
            reader.formBegin()
            RectTextSearch(reader: reader, pos: pos, srch_str: &srch_str)
            reader.end()
        }
        // New-line markers and all other element types are ignored.
    }
}
59
/// Collects the text of `page` that falls inside the selection rectangle
/// `pos`, which is expressed in the PDF user/page coordinate system.
///
/// - Parameters:
///   - page: The page to read.
///   - pos: The selection rectangle.
///   - reader: The element reader used to walk the page; it is opened on the
///     page and closed again before this function returns.
/// - Returns: The text gathered by `RectTextSearch`.
func ReadTextFromRect(page: PTPage, pos: PTPDFRect, reader: PTElementReader) -> String {
    var collected = String()

    reader.begin(page)
    RectTextSearch(reader: reader, pos: pos, srch_str: &collected)
    reader.end()

    return collected
}
70
/// Prints a ` style="..."` XML attribute describing the text style `s`:
/// font family, font size, an optional " sans-serif;" marker, and the color
/// as a #RRGGBB hex triplet (the same layout the Objective-C sample emits).
///
/// - Parameter s: The text style to describe.
func PrintStyle(_ s: PTTextExtractorStyle) {
    let rgb: NSMutableArray = s.getColor()

    // Reads one color component; missing or non-numeric entries count as 0.
    func channel(_ index: Int) -> Int32 {
        guard index < rgb.count, let value = rgb[index] as? NSNumber else {
            return 0
        }
        return value.int32Value
    }

    // Avoid force-unwrapping: a style without a font name prints an empty family.
    let fontName: String = s.getFontName() ?? ""
    // Only fonts WITHOUT serifs get the " sans-serif;" marker.
    let sansSerif = s.isSerif() ? "" : " sans-serif;"
    let color = String(format: "%02X%02X%02X", channel(0), channel(1), channel(2))

    print(" style=\"font-family:\(fontName); font-size:\(s.getFontSize());\(sansSerif) color:#\(color);\"")
}
75
/// Runs the TextExtract sample against the bundled newsletter.pdf.
///
/// Executes whichever examples are enabled by the boolean switches below.
///
/// - Returns: 0 on success, 1 if the page is missing or an error is caught.
func runTextExtractTest() -> Int {
    return autoreleasepool {
        var ret = 0


        // Switches selecting which examples run.
        let example1_basic = true
        let example2_xml = true
        let example3_wordlist = true
        let example4_advanced = true
        let example5_low_level = false

        // Sample code showing how to use high-level text extraction APIs.
        do {
            try PTPDFNet.catchException {
                let doc: PTPDFDoc = PTPDFDoc(filepath: Bundle.main.path(forResource: "newsletter", ofType: "pdf"))
                doc.initSecurityHandler()

                guard let page: PTPage = doc.getPage(1) else {
                    print("Page not found.")
                    ret = 1
                    return
                }

                let txt: PTTextExtractor = PTTextExtractor()
                txt.begin(page, clip_ptr: nil, flags: 0)    // Read the page.
                // Other options you may want to consider...
                // txt.begin(page, nil, e_ptno_dup_remove);
                // txt.begin(page, nil, e_ptremove_hidden_text);

                // Example 1. Get all text on the page in a single string.
                // Words will be separated with space or new line characters.
                if example1_basic {
                    // Get the word count.
                    print("Word Count: \(txt.getWordCount())")

                    let text: String = txt.getAsText(true)
                    print("\n\n- GetAsText --------------------------\n\(text)")
                    print("-----------------------------------------------------------")
                }

                // Example 2. Get XML logical structure for the page.
                if example2_xml {
                    let text: String = txt.getAsXML(e_ptwords_as_elements.rawValue | e_ptoutput_bbox.rawValue | e_ptoutput_style_info.rawValue)
                    print("\n\n- GetAsXML  --------------------------\n\(text)")
                    print("-----------------------------------------------------------")
                }

                // Example 3. Extract words one by one.
                if example3_wordlist {
                    var line: PTTextExtractorLine = txt.getFirstLine()
                    while line.isValid() {
                        var word: PTWord = line.getFirstWord()
                        while word.isValid() {
                            print("\(word.getString() ?? "")")
                            word = word.getNext()
                        }
                        line = line.getNext()
                    }
                    print("-----------------------------------------------------------")
                }

                // Example 4. A more advanced text extraction example.
                // The output is XML structure containing paragraphs, lines, words,
                // as well as style and positioning information.
                if example4_advanced {
                    var cur_flow_id = -1
                    var cur_para_id = -1

                    // Root element, matching the Objective-C version of this sample.
                    print("<PDFText>")

                    // For each line on the page...
                    var line: PTTextExtractorLine = txt.getFirstLine()
                    while line.isValid() {
                        if line.getNumWords() == 0 {
                            // Advance before `continue`; otherwise the loop would
                            // spin forever on the same empty line.
                            line = line.getNext()
                            continue
                        }

                        // A new flow closes any open paragraph and flow first.
                        if cur_flow_id != line.getFlowID() {
                            if cur_flow_id != -1 {
                                if cur_para_id != -1 {
                                    cur_para_id = -1
                                    print("</Para>")
                                }
                                print("</Flow>\n")
                            }
                            cur_flow_id = Int(line.getFlowID())
                            print("<Flow id=\"\(cur_flow_id)\">\n")
                        }
                        if cur_para_id != line.getParagraphID() {
                            if cur_para_id != -1 {
                                print("</Para>\n")
                            }
                            cur_para_id = Int(line.getParagraphID())
                            print("<Para id=\"\(cur_para_id)\">\n")
                        }

                        let b: PTPDFRect = line.getBBox()
                        let line_style: PTTextExtractorStyle = line.getStyle()
                        print("<Line box=\"\(b.getX1()), \(b.getY1()), \(b.getX2()), \(b.getY2())\"")
                        PrintStyle(line_style)
                        print(">\n")

                        // For each word in the line...
                        var word: PTWord = line.getFirstWord()
                        while word.isValid() {
                            // Skip empty words BEFORE writing anything (so no
                            // unterminated <Word ...> tag is emitted) and advance
                            // first so the loop cannot spin on the same word.
                            let sz = word.getStringLen()
                            if sz == 0 {
                                word = word.getNext()
                                continue
                            }

                            // Output the bounding box for the word.
                            let q: PTPDFRect = word.getBBox()
                            print("<Word box=\"\(q.getX1()), \(q.getY1()), \(q.getX2()), \(q.getY2())\"")

                            // If the word style is different from the parent style, output the new style.
                            // NOTE(review): `!=` on these objects relies on NSObject equality; the
                            // Objective-C sample uses isEqualTo: — confirm both compare style values.
                            let s: PTTextExtractorStyle = word.getStyle()
                            if s != line_style {
                                PrintStyle(s)
                            }

                            let uni_str: String = word.getString()
                            print(">\(uni_str)")
                            print("</Word>\n")
                            word = word.getNext()
                        }
                        print("</Line>\n")
                        line = line.getNext()
                    }

                    // Close whatever paragraph/flow is still open.
                    if cur_flow_id != -1 {
                        if cur_para_id != -1 {
                            cur_para_id = -1
                            print("</Para>\n")
                        }
                        print("</Flow>\n")
                    }
                    print("</PDFText>")
                }
            }
        } catch let e as NSError {
            print("\(e)")
            ret = 1
        }

        // The low-level example is currently disabled (body commented out).
        if example5_low_level {
//            do {
//                try PTPDFNet.catchException {
//                    let doc: PTPDFDoc = PTPDFDoc(filepath: Bundle.main.path(forResource: "newsletter", ofType: "pdf"))
//                    doc.initSecurityHandler()
//
//                    // Example 1. Extract all text content from the document
//
//                    let reader: PTElementReader = PTElementReader()
//                    //  Read every page
//                    let itr: PTPageIterator = doc.getPageIterator(1)
//                    while itr.hasNext() {
//                        reader.begin(itr.current())
//                        DumpAllText(reader)
//                        reader.end()
//                        itr.next()
//                    }
//
//                    // Example 2. Extract text content based on the
//                    // selection rectangle.
//                    print("\n----------------------------------------------------")
//                    print("\nExtract text based on the selection rectangle.")
//                    print("\n----------------------------------------------------\n")
//
//                    let first_page: PTPage = doc.getPageIterator(1).current()
//                    let rect1: PTPDFRect = PTPDFRect(x1: 27, y1: 392, x2: 563, y2: 534)
//                    var s1: String = ReadTextFromRect(page: first_page, pos: rect1, reader: reader)
//                    print("\nField 1: \(s1)")
//
//                    let rect2: PTPDFRect = PTPDFRect(x1: 28, y1: 551, x2: 106, y2: 623)
//                    s1 = ReadTextFromRect(page: first_page, pos: rect2, reader: reader)
//                    print("\nField 2: \(s1)")
//
//                    let rect3: PTPDFRect = PTPDFRect(x1: 208, y1: 550, x2: 387, y2: 621)
//                    s1 = ReadTextFromRect(page: first_page, pos: rect3, reader: reader)
//                    print("\nField 3: \(s1)")
//
//                    // ...
//                    print("Done.")
//                }
//            } catch let e as NSError {
//                print("\(e)")
//                ret = 1
//            }
        }

        return ret
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales

Product:

TextExtract