TextExtract

Sample Objective-C code that uses the Apryse SDK to read a PDF (parse and extract text). If you'd like to search text on PDF pages, see our code sample for text search. Learn more about our iOS SDK and PDF Data Extraction SDK capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#import <OBJC/PDFNetOBJC.h>
7#import <Foundation/Foundation.h>
8
9// This sample illustrates the basic text extraction capabilities of PDFNet.
10
// Logs every text-related element delivered by `reader` to the console.
//
// Elements are pulled with -Next until it returns NULL. Text-block begin/end
// and new-line markers are logged as tags; each text run is logged as its
// bounding box followed by its string. Form XObjects are entered with
// -FormBegin, dumped by a recursive call, and closed with -End.
void DumpAllText(PTElementReader *reader)
{
    for (PTElement *current = [reader Next]; current != NULL; current = [reader Next])
    {
        switch ([current GetType])
        {
            case e_pttext_begin:
            {
                NSLog(@"--> Text Block Begin");
                break;
            }
            case e_pttext_end:
            {
                NSLog(@"--> Text Block End");
                break;
            }
            case e_pttext:
            {
                // Bounding box first, then the run's text on its own log line.
                PTPDFRect *box = [current GetBBox];
                double left = [box GetX1];
                double bottom = [box GetY1];
                double right = [box GetX2];
                double top = [box GetY2];
                NSLog(@"--> BBox: %f, %f, %f, %f", left, bottom, right, top);

                NSString *runText = [current GetTextString];
                NSLog(@"%@", runText);
                break;
            }
            case e_pttext_new_line:
            {
                NSLog(@"--> New Line");
                break;
            }
            case e_ptform:
            {
                // Descend into the form XObject and dump its content stream too.
                [reader FormBegin];
                DumpAllText(reader);
                [reader End];
                break;
            }
            default:
                break;
        }
    }
}
46
// A helper for ReadTextFromRect.
//
// Pulls elements from `reader` until -Next returns nil. For every text run
// whose bounding box intersects `pos`, the run's string followed by "\n" is
// appended to the string referenced by `srch_str`. Form XObjects are entered
// with -FormBegin, searched recursively, and closed with -End.
//
// reader   - element reader the caller has already started (-Begin / -FormBegin).
// pos      - selection rectangle to test each text run against.
// srch_str - in/out accumulator; *srch_str is replaced by a new string on
//            every append.
void RectTextSearch(PTElementReader *reader, PTPDFRect * pos, NSString **srch_str)
{
    PTElement *element;
    while ((element = [reader Next]) != nil)
    {
        switch ([element GetType])
        {
            case e_pttext:
            {
                PTPDFRect *bbox = [element GetBBox];
                // NOTE(review): bbox is passed as both receiver and first
                // operand; presumably the call overwrites bbox with the
                // intersection. bbox is not read again afterwards.
                if ([bbox IntersectRect: bbox rect2: pos])
                {
                    NSString *text = [element GetTextString];
                    // Appending nil raises NSInvalidArgumentException, so a
                    // run without a string is skipped instead.
                    if (text != nil)
                    {
                        // One line per matching text run.
                        *srch_str = [*srch_str stringByAppendingFormat: @"%@\n", text];
                    }
                }
                break;
            }
            case e_ptform: // Process form XObjects
            {
                [reader FormBegin];
                RectTextSearch(reader, pos, srch_str);
                [reader End];
                break;
            }
            default:
                // Everything else (including new-line markers) is ignored.
                break;
        }
    }
}
82
// Collects the text of `page` that falls inside the selection rectangle
// `pos` and returns it as a single string. The rectangle coordinates are
// expressed in the PDF user/page coordinate system.
//
// The supplied `reader` is started on the page, handed to RectTextSearch to
// accumulate the matching text, and ended again before returning. The result
// is the empty string when nothing was appended.
NSString* ReadTextFromRect(PTPage *page, PTPDFRect * pos, PTElementReader *reader)
{
    NSString *collected = @"";

    [reader Begin: page];
    RectTextSearch(reader, pos, &collected);
    [reader End];

    return collected;
}
94
95
// Writes a ` style="..."` XML attribute describing the text style `s` to
// stdout: font family, font size, an optional serif marker, and the colour
// as a six-digit hexadecimal RGB value. No newline is emitted.
void PrintStyle(PTTextExtractorStyle *s)
{
    NSArray *rgb = [s GetColor];
    NSString *name = [s GetFontName];

    // -UTF8String yields NULL when `name` is nil; passing NULL to "%s" is
    // undefined behaviour, so fall back to an empty family name.
    const char *font_family = [name UTF8String];
    if (font_family == NULL)
    {
        font_family = "";
    }

    double font_size = [s GetFontSize];

    // NOTE(review): a style reporting IsSerif emits " sans-serif;" — this
    // looks inverted, but the emitted text is kept unchanged; confirm the
    // intended output before altering it.
    const char *san_serif = ([s IsSerif]) ? " sans-serif;" : "";

    // Read each colour component once and reuse it in the format call.
    int R = [rgb[0] intValue];
    int G = [rgb[1] intValue];
    int B = [rgb[2] intValue];

    printf(" style=\"font-family:%s; font-size:%g;%s color:#%02X%02X%02X;\"", font_family, font_size, san_serif, R, G, B);
}
108
// Entry point of the sample.
//
// Initializes PDFNet, runs the high-level PTTextExtractor examples (1-4) on
// page 1 of newsletter.pdf, optionally runs the low-level PTElementReader
// example (5), terminates PDFNet, and returns 0 on success or 1 if the page
// is missing or an exception was caught. Which examples run is controlled by
// the example*_ flags below.
int main(int argc, char *argv[])
{
    @autoreleasepool {
        int ret = 0;
        [PTPDFNet Initialize: 0];

        bool example1_basic = false;
        bool example2_xml = false;
        bool example3_wordlist = false;
        bool example4_advanced = true;
        bool example5_low_level = false;

        // Sample code showing how to use high-level text extraction APIs.
        @try
        {
            PTPDFDoc *doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/newsletter.pdf"];
            [doc InitSecurityHandler];

            PTPage *page = [doc GetPage: 1];
            if (![page IsValid]){
                NSLog(@"Page not found.");
                // Balance the Initialize call above before leaving early.
                [PTPDFNet Terminate: 0];
                return 1;
            }

            PTTextExtractor *txt = [[PTTextExtractor alloc] init];
            [txt Begin: page clip_ptr: 0 flags: 0]; // Read the page.
            // Other options you may want to consider...
            // txt.Begin(*itr, 0, TextExtractor::e_no_dup_remove);
            // txt.Begin(*itr, 0, TextExtractor::e_remove_hidden_text);


            // Example 1. Get all text on the page in a single string.
            // Words will be separated with space or new line characters.
            if (example1_basic)
            {
                // Get the word count.
                NSLog(@"Word Count: %d", [txt GetWordCount]);

                NSString *text = [txt GetAsText: YES];
                NSLog(@"\n\n- GetAsText --------------------------\n%@", text);
                NSLog(@"-----------------------------------------------------------");
            }

            // Example 2. Get XML logical structure for the page.
            if (example2_xml)
            {
                NSString *text = [txt GetAsXML: e_ptwords_as_elements | e_ptoutput_bbox | e_ptoutput_style_info];
                NSLog(@"\n\n- GetAsXML --------------------------\n %@", text);
                NSLog(@"-----------------------------------------------------------");
            }

            // Example 3. Extract words one by one.
            if (example3_wordlist)
            {
                PTTextExtractorLine *line = [txt GetFirstLine];
                PTWord *word;
                for (; [line IsValid]; line=[line GetNextLine]) {
                    for (word=[line GetFirstWord]; [word IsValid]; word=[word GetNextWord]) {
                        NSLog(@"%@", [word GetString]);
                    }
                }
                NSLog(@"-----------------------------------------------------------");
            }

            // Example 4. A more advanced text extraction example.
            // The output is XML structure containing paragraphs, lines, words,
            // as well as style and positioning information.
            if (example4_advanced)
            {
                PTPDFRect * b, *q;
                // -1 means "no <Flow>/<Para> element is currently open".
                int cur_flow_id=-1, cur_para_id=-1;

                NSString *uni_str;
                PTTextExtractorLine *line;
                PTWord *word;
                PTTextExtractorStyle *s, *line_style;

                printf("<PDFText>\n");
                // For each line on the page...
                for (line=[txt GetFirstLine]; [line IsValid]; line=[line GetNextLine])
                {
                    if ( [line GetNumWords] == 0 )
                    {
                        continue;
                    }

                    // Entering a new flow: close any open paragraph and flow first.
                    if (cur_flow_id != [line GetFlowID]) {
                        if (cur_flow_id != -1) {
                            if (cur_para_id != -1) {
                                cur_para_id = -1;
                                printf("</Para>\n");
                            }
                            printf("</Flow>\n");
                        }
                        cur_flow_id = [line GetFlowID];
                        printf("<Flow id=\"%d\">\n", cur_flow_id);
                    }

                    // Entering a new paragraph: close the previous one if any.
                    if (cur_para_id != [line GetParagraphID]) {
                        if (cur_para_id != -1)
                            printf("</Para>\n");
                        cur_para_id = [line GetParagraphID];
                        printf("<Para id=\"%d\">\n", cur_para_id);
                    }

                    b = [line GetBBox];
                    line_style = [line GetStyle];
                    printf("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", [b GetX1], [b GetY1], [b GetX2], [b GetY2]);
                    PrintStyle(line_style);
                    printf(" cur_num=\"%d\"", [line GetCurrentNum]);
                    printf(">\n");

                    // For each word in the line...
                    for (word=[line GetFirstWord]; [word IsValid]; word=[word GetNextWord])
                    {
                        // Skip empty words before anything is printed, so an
                        // opening <Word tag is never left unterminated.
                        int sz = [word GetStringLen];
                        if (sz == 0) continue;

                        // Output the bounding box for the word.
                        q = [word GetBBox];
                        printf("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", [q GetX1], [q GetY1], [q GetX2], [q GetY2]);
                        printf(" cur_num=\"%d\"", [word GetCurrentNum]);

                        // If the word style is different from the parent style, output the new style.
                        s = [word GetStyle];
                        if(![s isEqualTo:line_style]){
                            PrintStyle(s);
                        }

                        uni_str = [word GetString];
                        printf(">%s", [uni_str UTF8String]);
                        printf("</Word>\n");
                    }
                    printf("</Line>\n");
                }

                // Close whatever is still open after the last line.
                if (cur_flow_id != -1) {
                    if (cur_para_id != -1) {
                        cur_para_id = -1;
                        printf("</Para>\n");
                    }
                    printf("</Flow>\n");
                }
                printf("</PDFText>\n");
            }
        }
        @catch(NSException *e)
        {
            NSLog(@"%@", e.reason);
            ret = 1;
        }

        // Sample code showing the low-level PTElementReader approach.
        if(example5_low_level)
        {
            @try
            {
                PTPDFDoc *doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/newsletter.pdf"];
                [doc InitSecurityHandler];

                // Example 1. Extract all text content from the document

                PTElementReader *reader = [[PTElementReader alloc] init];
                // Read every page
                PTPageIterator *itr;
                for (itr=[doc GetPageIterator: 1]; [itr HasNext]; [itr Next])
                {
                    [reader Begin: [itr Current]];
                    DumpAllText(reader);
                    [reader End];
                }

                // Example 2. Extract text content based on the
                // selection rectangle.
                NSLog(@"\n----------------------------------------------------");
                NSLog(@"\nExtract text based on the selection rectangle.");
                NSLog(@"\n----------------------------------------------------\n");

                PTPage *first_page = [doc GetPage: 1];
                PTPDFRect * rect1 = [[PTPDFRect alloc] initWithX1: 27 y1: 392 x2: 563 y2: 534];
                NSString *s1 = ReadTextFromRect(first_page, rect1, reader);
                NSLog(@"\nField 1: %@", s1);

                PTPDFRect * rect2 = [[PTPDFRect alloc] initWithX1: 28 y1: 551 x2: 106 y2: 623];
                s1 = ReadTextFromRect(first_page, rect2, reader);
                NSLog(@"\nField 2: %@", s1);

                PTPDFRect * rect3 = [[PTPDFRect alloc] initWithX1: 208 y1: 550 x2: 387 y2: 621];
                s1 = ReadTextFromRect(first_page, rect3, reader);
                NSLog(@"\nField 3: %@", s1);

                // ...
                NSLog(@"Done.");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }
        }
        [PTPDFNet Terminate: 0];
        return ret;
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales