Sample Obj-C and Swift code for using the Apryse SDK to read a PDF (parse and extract text). If you'd like to search for text on PDF pages, see our code sample for text search. Learn more about our iOS SDK and PDF Data Extraction SDK capabilities.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#import <OBJC/PDFNetOBJC.h>
7#import <Foundation/Foundation.h>
8
9// This sample illustrates the basic text extraction capabilities of PDFNet.
10
// Dumps all text content reachable from `reader` to the console.
//
// Pulls elements from the reader until it returns nil and logs a marker for
// text-block begin/end and new-line elements. For text elements it logs the
// bounding box followed by the text string. Form XObjects are entered with
// FormBegin, dumped recursively, and closed with End.
//
// The caller is expected to have called Begin: on the reader already (see the
// call sites in main).
void DumpAllText(PTElementReader *reader)
{
    PTElement *element;
    while ((element = [reader Next]) != nil)
    {
        switch ([element GetType])
        {
            case e_pttext_begin:
                NSLog(@"--> Text Block Begin");
                break;
            case e_pttext_end:
                NSLog(@"--> Text Block End");
                break;
            // Text elements are reported as e_pttext_obj, the same constant the
            // Swift port of this sample switches on.
            case e_pttext_obj:
            {
                PTPDFRect *bbox = [element GetBBox];
                NSLog(@"--> BBox: %f, %f, %f, %f", [bbox GetX1], [bbox GetY1], [bbox GetX2], [bbox GetY2]);

                NSLog(@"%@", [element GetTextString]);
                break;
            }
            case e_pttext_new_line:
                NSLog(@"--> New Line");
                break;
            case e_ptform: // Process form XObjects
                [reader FormBegin];
                DumpAllText(reader);
                [reader End];
                break;
            default:
                break;
        }
    }
}
46
// A helper for ReadTextFromRect.
//
// Pulls elements from `reader` until it returns nil. For every text element
// whose bounding box intersects `pos`, the element's text plus a trailing
// "\n" is appended to *srch_str. Form XObjects are searched recursively.
//
// reader   - element reader; the caller has already called Begin: on it.
// pos      - selection rectangle to test each text bounding box against.
// srch_str - in/out accumulator; must point to a non-nil NSString.
void RectTextSearch(PTElementReader *reader, PTPDFRect * pos, NSString **srch_str)
{
    PTElement *element;
    while ((element = [reader Next]) != nil)
    {
        switch ([element GetType])
        {
            // Text elements are reported as e_pttext_obj, the same constant the
            // Swift port of this sample switches on.
            case e_pttext_obj:
            {
                PTPDFRect *bbox = [element GetBBox];
                // bbox is both the receiver and the first operand.
                // NOTE(review): presumably the intersection is written back
                // into bbox; only the boolean result is used here.
                if ([bbox IntersectRect: bbox rect2: pos])
                {
                    NSString *text = [element GetTextString];
                    // stringByAppendingString: raises on a nil argument, so a
                    // missing text string is treated as empty (the Swift port
                    // does the same with `?? ""`).
                    if (text != nil)
                    {
                        *srch_str = [*srch_str stringByAppendingString: text];
                    }
                    *srch_str = [*srch_str stringByAppendingString: @"\n"];
                }
                break;
            }
            case e_ptform: // Process form XObjects
            {
                [reader FormBegin];
                RectTextSearch(reader, pos, srch_str);
                [reader End];
                break;
            }
            default:
                // Every other element type, including new-line markers, is ignored.
                break;
        }
    }
}
82
// Extracts the text found inside a selection rectangle on one page.
//
// Opens `page` on `reader`, lets RectTextSearch accumulate the text of every
// element intersecting `pos`, closes the reader, and returns the accumulated
// string (empty when nothing matched). The rectangle is given in the PDF
// user/page coordinate system.
NSString* ReadTextFromRect(PTPage *page, PTPDFRect * pos, PTElementReader *reader)
{
    NSString *collected = @"";

    [reader Begin: page];
    RectTextSearch(reader, pos, &collected);
    [reader End];

    return collected;
}
94
95
// Prints a style as an XML attribute of the form
//   style="font-family:<name>; font-size:<size>;[ sans-serif;] color:#RRGGBB;"
// to stdout, with a leading space and no trailing newline.
//
// s - the text style to describe; its colour is read as three components
//     (indices 0..2 of the array returned by GetColor).
void PrintStyle(PTTextExtractorStyle *s)
{
    NSArray *rgb = [s GetColor];
    NSString *name = [s GetFontName];
    // A nil name would hand NULL to printf's %s, which is undefined behaviour.
    const char *font_family = (name != nil) ? [name UTF8String] : "";
    double font_size = [s GetFontSize];
    // The " sans-serif;" token belongs on fonts that are NOT serif; the
    // previous condition emitted it for serif fonts.
    const char *sans_serif = [s IsSerif] ? "" : " sans-serif;";
    unsigned int R = (unsigned int)[rgb[0] intValue];
    unsigned int G = (unsigned int)[rgb[1] intValue];
    unsigned int B = (unsigned int)[rgb[2] intValue];
    printf(" style=\"font-family:%s; font-size:%g;%s color:#%02X%02X%02X;\"", font_family, font_size, sans_serif, R, G, B);
}
108
// Entry point of the text extraction sample.
//
// Initializes PDFNet, runs whichever examples are enabled by the boolean
// switches below against ../../TestFiles/newsletter.pdf, then terminates
// PDFNet. Returns 0 on success, or 1 when page 1 is not valid or an
// NSException is caught.
int main(int argc, char *argv[])
{
    @autoreleasepool {
        int ret = 0;
        [PTPDFNet Initialize: 0];

        // Switches selecting which examples run.
        bool example1_basic = false;
        bool example2_xml = false;
        bool example3_wordlist = false;
        bool example4_advanced = true;
        bool example5_low_level = false;

        // Sample code showing how to use high-level text extraction APIs.
        @try
        {
            PTPDFDoc *doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/newsletter.pdf"];
            [doc InitSecurityHandler];

            PTPage *page = [doc GetPage: 1];
            if (![page IsValid]) {
                NSLog(@"Page not found.");
                // Balance the Initialize call above before leaving early.
                [PTPDFNet Terminate: 0];
                return 1;
            }

            PTTextExtractor *txt = [[PTTextExtractor alloc] init];
            [txt Begin: page clip_ptr: 0 flags: 0]; // Read the page.
            // Other options you may want to consider...
            // [txt Begin: page clip_ptr: 0 flags: e_ptno_dup_remove];
            // [txt Begin: page clip_ptr: 0 flags: e_ptremove_hidden_text];


            // Example 1. Get all text on the page in a single string.
            // Words will be separated with space or new line characters.
            if (example1_basic)
            {
                // Get the word count.
                NSLog(@"Word Count: %d", [txt GetWordCount]);

                NSString *text = [txt GetAsText: YES];
                NSLog(@"\n\n- GetAsText --------------------------\n%@", text);
                NSLog(@"-----------------------------------------------------------");
            }

            // Example 2. Get XML logical structure for the page.
            if (example2_xml)
            {
                NSString *text = [txt GetAsXML: e_ptwords_as_elements | e_ptoutput_bbox | e_ptoutput_style_info];
                NSLog(@"\n\n- GetAsXML --------------------------\n %@", text);
                NSLog(@"-----------------------------------------------------------");
            }

            // Example 3. Extract words one by one.
            if (example3_wordlist)
            {
                PTTextExtractorLine *line = [txt GetFirstLine];
                PTWord *word;
                for (; [line IsValid]; line=[line GetNextLine]) {
                    for (word=[line GetFirstWord]; [word IsValid]; word=[word GetNextWord]) {
                        NSLog(@"%@", [word GetString]);
                    }
                }
                NSLog(@"-----------------------------------------------------------");
            }

            // Example 4. A more advanced text extraction example.
            // The output is XML structure containing paragraphs, lines, words,
            // as well as style and positioning information.
            if (example4_advanced)
            {
                PTPDFRect *b, *q;
                int cur_flow_id=-1, cur_para_id=-1;

                NSString *uni_str;
                PTTextExtractorLine *line;
                PTWord *word;
                PTTextExtractorStyle *s, *line_style;

                printf("<PDFText>\n");
                // For each line on the page...
                for (line=[txt GetFirstLine]; [line IsValid]; line=[line GetNextLine])
                {
                    if ( [line GetNumWords] == 0 )
                    {
                        continue;
                    }

                    // A change of flow closes any open <Para> and <Flow>
                    // before the new <Flow> is opened.
                    if (cur_flow_id != [line GetFlowID]) {
                        if (cur_flow_id != -1) {
                            if (cur_para_id != -1) {
                                cur_para_id = -1;
                                printf("</Para>\n");
                            }
                            printf("</Flow>\n");
                        }
                        cur_flow_id = [line GetFlowID];
                        printf("<Flow id=\"%d\">\n", cur_flow_id);
                    }

                    if (cur_para_id != [line GetParagraphID]) {
                        if (cur_para_id != -1)
                            printf("</Para>\n");
                        cur_para_id = [line GetParagraphID];
                        printf("<Para id=\"%d\">\n", cur_para_id);
                    }

                    b = [line GetBBox];
                    line_style = [line GetStyle];
                    printf("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", [b GetX1], [b GetY1], [b GetX2], [b GetY2]);
                    PrintStyle(line_style);
                    printf(" cur_num=\"%d\"", [line GetCurrentNum]);
                    printf(">\n");

                    // For each word in the line...
                    for (word=[line GetFirstWord]; [word IsValid]; word=[word GetNextWord])
                    {
                        // Skip empty words before emitting anything, so that
                        // no unterminated <Word ...> tag is left in the output.
                        if ([word GetStringLen] == 0) continue;

                        // Output the bounding box for the word.
                        q = [word GetBBox];
                        printf("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", [q GetX1], [q GetY1], [q GetX2], [q GetY2]);
                        printf(" cur_num=\"%d\"", [word GetCurrentNum]);

                        // If the word style is different from the parent style, output the new style.
                        s = [word GetStyle];
                        if(![s isEqualTo:line_style]){
                            PrintStyle(s);
                        }

                        uni_str = [word GetString];
                        printf(">%s", [uni_str UTF8String]);
                        printf("</Word>\n");
                    }
                    printf("</Line>\n");
                }

                // Close whatever is still open after the last line.
                if (cur_flow_id != -1) {
                    if (cur_para_id != -1) {
                        cur_para_id = -1;
                        printf("</Para>\n");
                    }
                    printf("</Flow>\n");
                }
                printf("</PDFText>\n");
            }
        }
        @catch(NSException *e)
        {
            NSLog(@"%@", e.reason);
            ret = 1;
        }

        // Sample code showing the low-level element reader APIs.
        if(example5_low_level)
        {
            @try
            {
                PTPDFDoc *doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/newsletter.pdf"];
                [doc InitSecurityHandler];

                // Example 1. Extract all text content from the document

                PTElementReader *reader = [[PTElementReader alloc] init];
                // Read every page
                PTPageIterator *itr;
                for (itr=[doc GetPageIterator: 1]; [itr HasNext]; [itr Next])
                {
                    [reader Begin: [itr Current]];
                    DumpAllText(reader);
                    [reader End];
                }

                // Example 2. Extract text content based on the
                // selection rectangle.
                NSLog(@"\n----------------------------------------------------");
                NSLog(@"\nExtract text based on the selection rectangle.");
                NSLog(@"\n----------------------------------------------------\n");

                PTPage *first_page = [doc GetPage: 1];
                PTPDFRect * rect1 = [[PTPDFRect alloc] initWithX1: 27 y1: 392 x2: 563 y2: 534];
                NSString *s1 = ReadTextFromRect(first_page, rect1, reader);
                NSLog(@"\nField 1: %@", s1);

                PTPDFRect * rect2 = [[PTPDFRect alloc] initWithX1: 28 y1: 551 x2: 106 y2: 623];
                s1 = ReadTextFromRect(first_page, rect2, reader);
                NSLog(@"\nField 2: %@", s1);

                PTPDFRect * rect3 = [[PTPDFRect alloc] initWithX1: 208 y1: 550 x2: 387 y2: 621];
                s1 = ReadTextFromRect(first_page, rect3, reader);
                NSLog(@"\nField 3: %@", s1);

                // ...
                NSLog(@"Done.");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }
        }
        [PTPDFNet Terminate: 0];
        return ret;
    }
}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import PDFNet
7import Foundation
8
9// This sample illustrates the basic text extraction capabilities of PDFNet.
10
/// Prints every piece of text content reachable from `reader` to the console.
///
/// Elements are pulled from the reader until it returns nil. Text-block
/// begin/end and new-line elements produce a marker line; text objects produce
/// their bounding box followed by their text string. Form XObjects are entered
/// with `formBegin()`, dumped recursively, and closed with `end()`.
func DumpAllText(_ reader: PTElementReader) {
    while let item = reader.next() {
        let kind = item.getType()

        if kind == e_pttext_begin {
            print("--> Text Block Begin")
        } else if kind == e_pttext_end {
            print("--> Text Block End")
        } else if kind == e_pttext_obj {
            let box: PTPDFRect = item.getBBox()
            let x1 = box.getX1()
            let y1 = box.getY1()
            let x2 = box.getX2()
            let y2 = box.getY2()
            print("--> BBox: \(x1), \(y1), \(x2), \(y2)")
            print(item.getTextString()!)
        } else if kind == e_pttext_new_line {
            print("--> New Line")
        } else if kind == e_ptform {
            // Process form XObjects
            reader.formBegin()
            DumpAllText(reader)
            reader.end()
        }
    }
}
35
/// Helper for `ReadTextFromRect`.
///
/// Pulls elements from `reader` until it returns nil. For each text object
/// whose bounding box intersects `pos`, appends its text (or an empty string
/// when the text is nil) and a trailing "\n" to `srch_str`. Form XObjects are
/// searched recursively; every other element type is ignored.
func RectTextSearch(reader: PTElementReader, pos: PTPDFRect, srch_str: inout String) {
    while let item = reader.next() {
        switch item.getType() {
        case e_ptform:
            // Process form XObjects
            reader.formBegin()
            RectTextSearch(reader: reader, pos: pos, srch_str: &srch_str)
            reader.end()
        case e_pttext_obj:
            let box: PTPDFRect = item.getBBox()
            // Elements outside the selection contribute nothing.
            guard box.intersect(box, rect2: pos) else {
                continue
            }
            let fragment = item.getTextString() ?? ""
            srch_str += fragment
            srch_str += "\n" // add a new line?
        default:
            break
        }
    }
}
59
/// Extracts the text found inside a selection rectangle on one page.
///
/// Opens `page` on `reader`, lets `RectTextSearch` accumulate the text of all
/// elements intersecting `pos`, closes the reader and returns the accumulated
/// string (empty when nothing matched). The rectangle is expressed in the PDF
/// user/page coordinate system.
func ReadTextFromRect(page: PTPage, pos: PTPDFRect, reader: PTElementReader) -> String {
    var collected = String()

    reader.begin(page)
    RectTextSearch(reader: reader, pos: pos, srch_str: &collected)
    reader.end()

    return collected
}
70
/// Prints a style as an XML attribute of the form
/// ` style="font-family:<name>; font-size:<size>;[ sans-serif;] color:#RRGGBB;"`,
/// matching the layout used by the Objective-C version of this sample.
///
/// Previously the boolean result of `isSerif()` was printed under a
/// "sans-serif" label and the colour was printed as a comma-separated decimal
/// list after `#`.
///
/// - Parameter s: the text style to describe.
func PrintStyle(_ s: PTTextExtractorStyle) {
    let rgb: NSMutableArray = s.getColor()

    // Two upper-case hex digits per component; a missing or non-numeric
    // component is rendered as 00 instead of trapping.
    let hexColor = (0..<3).map { index -> String in
        var component: Int32 = 0
        if index < rgb.count, let number = rgb[index] as? NSNumber {
            component = number.int32Value
        }
        return String(format: "%02X", component)
    }.joined()

    let fontFamily: String = s.getFontName() ?? ""
    // The token is emitted only for fonts that are not serif.
    let sansSerif = s.isSerif() ? "" : " sans-serif;"

    print(" style=\"font-family:\(fontFamily); font-size:\(s.getFontSize());\(sansSerif) color:#\(hexColor);\"")
}
75
/// Runs the text extraction examples against the bundled `newsletter.pdf`.
///
/// Which examples run is controlled by the boolean switches at the top of the
/// body. Returns 0 on success, or 1 when page 1 cannot be obtained or
/// `PTPDFNet.catchException` reports an error.
func runTextExtractTest() -> Int {
    return autoreleasepool {
        var ret = 0


        // Switches selecting which examples run.
        let example1_basic = true
        let example2_xml = true
        let example3_wordlist = true
        let example4_advanced = true
        let example5_low_level = false

        // Sample code showing how to use high-level text extraction APIs.
        do {
            try PTPDFNet.catchException {
                let doc: PTPDFDoc = PTPDFDoc(filepath: Bundle.main.path(forResource: "newsletter", ofType: "pdf"))
                doc.initSecurityHandler()

                // Validate the page the same way the Objective-C sample does.
                guard let page: PTPage = doc.getPage(1), page.isValid() else {
                    print("Page not found.")
                    ret = 1
                    return
                }

                let txt: PTTextExtractor = PTTextExtractor()
                txt.begin(page, clip_ptr: nil, flags: 0) // Read the page.
                // Other options you may want to consider...
                // txt.begin(page, nil, e_ptno_dup_remove);
                // txt.begin(page, nil, e_ptremove_hidden_text);

                // Example 1. Get all text on the page in a single string.
                // Words will be separated with space or new line characters.
                if example1_basic {
                    // Get the word count.
                    print("Word Count: \(txt.getWordCount())")

                    let text: String = txt.getAsText(true)
                    print("\n\n- GetAsText --------------------------\n\(text)")
                    print("-----------------------------------------------------------")
                }

                // Example 2. Get XML logical structure for the page.
                if example2_xml {
                    let text: String = txt.getAsXML(e_ptwords_as_elements.rawValue | e_ptoutput_bbox.rawValue | e_ptoutput_style_info.rawValue)
                    print("\n\n- GetAsXML --------------------------\n\(text)")
                    print("-----------------------------------------------------------")
                }

                // Example 3. Extract words one by one.
                if example3_wordlist {
                    var line: PTTextExtractorLine = txt.getFirstLine()
                    while line.isValid() {
                        var word: PTWord = line.getFirstWord()
                        while word.isValid() {
                            print("\(word.getString()!)")
                            word = word.getNext()
                        }
                        line = line.getNext()
                    }
                    print("-----------------------------------------------------------")
                }

                // Example 4. A more advanced text extraction example.
                // The output is XML structure containing paragraphs, lines, words,
                // as well as style and positioning information.
                if example4_advanced {
                    var cur_flow_id = -1
                    var cur_para_id = -1

                    // For each line on the page...
                    var line: PTTextExtractorLine = txt.getFirstLine()
                    while line.isValid() {
                        if line.getNumWords() == 0 {
                            // Advance before skipping; a bare `continue` would
                            // re-test the same line forever.
                            line = line.getNext()
                            continue
                        }

                        // A change of flow closes any open <Para> and <Flow>
                        // before the new <Flow> is opened.
                        if cur_flow_id != line.getFlowID() {
                            if cur_flow_id != -1 {
                                if cur_para_id != -1 {
                                    cur_para_id = -1
                                    print("</Para>")
                                }
                                print("</Flow>\n")
                            }
                            cur_flow_id = Int(line.getFlowID())
                            print("<Flow id=\"\(cur_flow_id)\">\n")
                        }
                        if cur_para_id != line.getParagraphID() {
                            if cur_para_id != -1 {
                                print("</Para>\n")
                            }
                            cur_para_id = Int(line.getParagraphID())
                            print("<Para id=\"\(cur_para_id)\">\n")
                        }

                        let b: PTPDFRect = line.getBBox()
                        let line_style: PTTextExtractorStyle = line.getStyle()
                        print("<Line box=\"\(b.getX1()), \(b.getY1()), \(b.getX2()), \(b.getY2())\"")
                        PrintStyle(line_style)
                        print(">\n")

                        // For each word in the line...
                        var word: PTWord = line.getFirstWord()
                        while word.isValid() {
                            // Skip empty words before emitting anything, and
                            // advance first so the loop cannot spin on them.
                            if word.getStringLen() == 0 {
                                word = word.getNext()
                                continue
                            }

                            // Output the bounding box for the word.
                            let q: PTPDFRect = word.getBBox()
                            print("<Word box=\"\(q.getX1()), \(q.getY1()), \(q.getX2()), \(q.getY2())\"")

                            // If the word style is different from the parent style, output the new style.
                            // NOTE(review): the Objective-C sample compares styles with
                            // isEqualTo:; confirm that `!=` gives the same result here.
                            let s: PTTextExtractorStyle = word.getStyle()
                            if s != line_style {
                                PrintStyle(s)
                            }

                            let uni_str: String = word.getString()
                            print(">\(uni_str)")
                            print("</Word>\n")
                            word = word.getNext()
                        }
                        print("</Line>\n")
                        line = line.getNext()
                    }

                    // Close whatever is still open after the last line.
                    if cur_flow_id != -1 {
                        if cur_para_id != -1 {
                            cur_para_id = -1
                            print("</Para>\n")
                        }
                        print("</Flow>\n")
                    }
                }
            }
        } catch let e as NSError {
            print("\(e)")
            ret = 1
        }

        // The low-level element reader example is kept for reference but is
        // commented out, so enabling the switch currently does nothing.
        if example5_low_level {
//            do {
//                try PTPDFNet.catchException {
//                    let doc: PTPDFDoc = PTPDFDoc(filepath: Bundle.main.path(forResource: "newsletter", ofType: "pdf"))
//                    doc.initSecurityHandler()
//
//                    // Example 1. Extract all text content from the document
//
//                    let reader: PTElementReader = PTElementReader()
//                    // Read every page
//                    let itr: PTPageIterator = doc.getPageIterator(1)
//                    while itr.hasNext() {
//                        reader.begin(itr.current())
//                        DumpAllText(reader)
//                        reader.end()
//                        itr.next()
//                    }
//
//                    // Example 2. Extract text content based on the
//                    // selection rectangle.
//                    print("\n----------------------------------------------------")
//                    print("\nExtract text based on the selection rectangle.")
//                    print("\n----------------------------------------------------\n")
//
//                    let first_page: PTPage = doc.getPageIterator(1).current()
//                    let rect1: PTPDFRect = PTPDFRect(x1: 27, y1: 392, x2: 563, y2: 534)
//                    var s1: String = ReadTextFromRect(page: first_page, pos: rect1, reader: reader)
//                    print("\nField 1: \(s1)")
//
//                    let rect2: PTPDFRect = PTPDFRect(x1: 28, y1: 551, x2: 106, y2: 623)
//                    s1 = ReadTextFromRect(page: first_page, pos: rect2, reader: reader)
//                    print("\nField 2: \(s1)")
//
//                    let rect3: PTPDFRect = PTPDFRect(x1: 208, y1: 550, x2: 387, y2: 621)
//                    s1 = ReadTextFromRect(page: first_page, pos: rect3, reader: reader)
//                    print("\nField 3: \(s1)")
//
//                    // ...
//                    print("Done.")
//                }
//            } catch let e as NSError {
//                print("\(e)")
//                ret = 1
//            }
        }

        return ret
    }
}
Did you find this helpful?
Trial setup questions? Ask experts on Discord.
Need other help? Contact Support.
Pricing or product questions? Contact Sales.