This sample Obj-C code shows how to use the Apryse OCR module on scanned documents in multiple languages. The OCR module can make PDFs searchable and extract scanned text for further indexing. Learn more about our iOS SDK.
//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------

#import <OBJC/PDFNetOBJC.h>
#import <Foundation/Foundation.h>

//---------------------------------------------------------------------------------------
// The following sample illustrates how to use the OCR module
//---------------------------------------------------------------------------------------
// Entry point: runs six independent OCR examples in sequence. Each example is
// wrapped in its own @try/@catch, so an NSException raised by one example is
// logged and the remaining examples still run.
//
// Returns 0 when every step completed without raising, 1 if any NSException
// was caught (so that a calling script can detect the failure).
int main(int argc, char *argv[])
{
    @autoreleasepool {
        int ret = 0;

        @try
        {
            // The first step in every application using PDFNet is to initialize the
            // library and set the path to common PDF resources. The library is usually
            // initialized only once, but calling Initialize() multiple times is also fine.
            [PTPDFNet Initialize: 0];

            //--------------------------------------------------------------------------------
            // Example 1) Process image without specifying options, default language - English - is used

            @try
            {
                // A) Setup empty destination doc

                PTPDFDoc * doc = [[PTPDFDoc alloc] init];

                // B) Run OCR on the .png without options (nil is passed)

                NSString * img = @"../../TestFiles/psychomachia_excerpt.png";

                [PTOCRModule ImageToPDF: doc src: img options: nil];

                // C) check the result

                NSString * outputFile = @"../../TestFiles/Output/psychomachia_excerpt.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 1 psychomachia_excerpt.png\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 2) Process document using multiple languages

            @try
            {
                // A) Setup empty destination doc

                PTPDFDoc * doc = [[PTPDFDoc alloc] init];

                // B) Setup options with multiple target languages, English will always be considered as secondary language

                PTObjSet * set = [[PTObjSet alloc] init];
                PTObj * options = [set CreateDict];
                PTObj * lang_array = [options PutArray: @"Langs"];
                [lang_array PushBackString: @"rus"];
                [lang_array PushBackString: @"deu"];

                // C) Run OCR on the .jpg with options

                NSString * img = @"../../TestFiles/multi_lang.jpg";

                [PTOCRModule ImageToPDF: doc src: img options: options];

                // D) check the result

                NSString * outputFile = @"../../TestFiles/Output/multi_lang.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 2: multi_lang.jpg\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image

            @try
            {
                // A) Open the .pdf document

                PTPDFDoc * doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/german_kids_song.pdf"];

                // B) Setup options with a single language and an ignore zone

                PTObjSet * set = [[PTObjSet alloc] init];
                PTObj * options = [set CreateDict];
                PTObj * lang_array = [options PutArray: @"Langs"];
                [lang_array PushBackString: @"deu"];

                // "IgnoreZones" holds one nested array per page; only page 1 is set up here.
                PTObj * ignore_zones = [options PutArray: @"IgnoreZones"];
                PTObj * ignore_zones_page1 = [ignore_zones PushBackArray];
                [ignore_zones_page1 PushBackRect: 1768 y1: 680 x2: 2056 y2: 3044];

                // C) Run OCR on the .pdf with options

                [PTOCRModule ProcessPDF: doc options: options];

                // D) check the result

                NSString * outputFile = @"../../TestFiles/Output/german_kids_song.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 3: german_kids_song.pdf\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 4) Process multipage tiff with text/ignore zones specified for each page,
            // optionally provide English as the target language

            @try
            {
                // A) Setup empty destination doc

                PTPDFDoc * doc = [[PTPDFDoc alloc] init];

                // B) Setup options with a single language plus text/ignore zones

                PTObjSet * set = [[PTObjSet alloc] init];
                PTObj * options = [set CreateDict];
                PTObj * lang_array = [options PutArray: @"Langs"];
                [lang_array PushBackString: @"eng"];

                PTObj * ignore_zones = [options PutArray: @"IgnoreZones"];
                PTObj * ignore_zones_page1 = [ignore_zones PushBackArray];
                PTObj * ignore_zones_page2 = [ignore_zones PushBackArray];
                PTObj * ignore_zones_page3 = [ignore_zones PushBackArray];

                // ignore signature box in the first 2 pages
                [ignore_zones_page1 PushBackRect: 1492 y1: 56 x2: 2236 y2: 432];
                [ignore_zones_page2 PushBackRect: 1492 y1: 56 x2: 2236 y2: 432];

                // can use a combination of ignore and text boxes to focus on the page area of interest,
                // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
                [ignore_zones_page3 PushBackRect: 992 y1: 1276 x2: 1368 y2: 1372];

                PTObj * text_zones = [options PutArray: @"TextZones"];

                // we only have text zones selected in page 3, so the arrays pushed for
                // pages 1 and 2 are left empty (their return values are not needed)
                [text_zones PushBackArray];
                [text_zones PushBackArray];
                PTObj * text_zones_page3 = [text_zones PushBackArray];

                // select horizontal BUFFER ZONE sign
                [text_zones_page3 PushBackRect: 900 y1: 2384 x2: 1236 y2: 2480];

                // select right vertical BUFFER ZONE sign
                [text_zones_page3 PushBackRect: 1960 y1: 1976 x2: 2016 y2: 2296];

                // select Lot No.
                [text_zones_page3 PushBackRect: 696 y1: 1028 x2: 1196 y2: 1128];

                // select part of the plan inside the BUFFER ZONE
                [text_zones_page3 PushBackRect: 428 y1: 1484 x2: 1784 y2: 2344];
                [text_zones_page3 PushBackRect: 948 y1: 1288 x2: 1672 y2: 1476];

                // C) Run OCR on the .tif with options

                NSString * img = @"../../TestFiles/bc_environment_protection.tif";

                [PTOCRModule ImageToPDF: doc src: img options: options];

                // D) check the result

                NSString * outputFile = @"../../TestFiles/Output/bc_environment_protection.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 4: bc_environment_protection.tif\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing
            // (e.g., removing words not in the dictionary or filtering out special characters),
            // and finally applying modified OCR JSON to the source PDF document

            @try
            {
                // A) Open the .pdf document

                PTPDFDoc * doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/zero_value_test_no_text.pdf"];

                // B) Run OCR on the .pdf with default English language

                NSString * json = [PTOCRModule GetOCRJsonFromPDF: doc options: nil];

                // C) Post-processing step (whatever it might be), but we just print json here

                NSLog(@"OCR result JSON: %@", json);

                // D) Apply potentially modified OCR JSON to the PDF

                [PTOCRModule ApplyOCRJsonToPDF: doc json: json];

                // E) Check the result

                NSString * outputFile = @"../../TestFiles/Output/zero_value_test_no_text.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 6) The postprocessing workflow has also an option of extracting OCR results
            // in XML format, similar to the one used by TextExtractor

            @try
            {
                // A) Setup empty destination doc

                PTPDFDoc * doc = [[PTPDFDoc alloc] init];

                // B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
                // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.

                NSString * img = @"../../TestFiles/physics.tif";

                NSString * xml = [PTOCRModule GetOCRXmlFromImage: doc src: img options: nil];

                // C) Post-processing step (whatever it might be), but we just print XML here

                NSLog(@"OCR result XML: %@", xml);

                // D) Apply potentially modified OCR XML to the PDF

                [PTOCRModule ApplyOCRXmlToPDF: doc xml: xml];

                // E) Check the result

                NSString * outputFile = @"../../TestFiles/Output/physics.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 6: extracting and applying OCR XML from physics.tif\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }
        }
        @catch(NSException *e)
        {
            // Reached when something outside the per-example handlers raises
            // (e.g. the Initialize call above).
            NSLog(@"%@", e.reason);
            ret = 1;
        }

        return ret;
    }
}
Did you find this helpful?
Trial setup questions? Ask experts on Discord.
Need other help? Contact Support.
Pricing or product questions? Contact Sales.