OCR

This sample Obj-C code shows how to use the Apryse OCR module on scanned documents in multiple languages. The OCR module can make PDFs searchable and extract scanned text for further indexing. Learn more about our iOS SDK.
//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------

#import <OBJC/PDFNetOBJC.h>
#import <Foundation/Foundation.h>

//---------------------------------------------------------------------------------------
// The following sample illustrates how to use the OCR module.
//
// Six independent examples are run one after another. Each example is wrapped in its
// own @try/@catch so that a failure in one (missing test file, OCR error, ...) is
// logged and does not prevent the remaining examples from running.
//
// Returns 0 when every example completed without raising an NSException, and 1 when
// at least one example (or the library initialization) raised.
//---------------------------------------------------------------------------------------
int main(int argc, char *argv[])
{
    @autoreleasepool {
        // Becomes 1 as soon as any example fails, so callers can detect the failure
        // from the process exit status instead of having to parse the log output.
        int ret = 0;

        @try
        {
            // The first step in every application using PDFNet is to initialize the
            // library. The library is usually initialized only once, but calling
            // Initialize() multiple times is also fine.
            [PTPDFNet Initialize: 0];

            //--------------------------------------------------------------------------------
            // Example 1) Process image without specifying options, default language - English - is used

            @try
            {
                // A) Setup empty destination doc

                PTPDFDoc *doc = [[PTPDFDoc alloc] init];

                // B) Run OCR on the .png without options (nil selects the defaults)

                NSString *img = @"../../TestFiles/psychomachia_excerpt.png";

                [PTOCRModule ImageToPDF: doc src: img options: nil];

                // C) Check the result

                NSString *outputFile = @"../../TestFiles/Output/psychomachia_excerpt.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 1 psychomachia_excerpt.png\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 2) Process document using multiple languages

            @try
            {
                // A) Setup empty destination doc

                PTPDFDoc *doc = [[PTPDFDoc alloc] init];

                // B) Setup options with multiple target languages, English will always be considered as secondary language

                PTObjSet *set = [[PTObjSet alloc] init];
                PTObj *options = [set CreateDict];
                PTObj *lang_array = [options PutArray: @"Langs"];
                [lang_array PushBackString: @"rus"];
                [lang_array PushBackString: @"deu"];

                // C) Run OCR on the .jpg with options

                NSString *img = @"../../TestFiles/multi_lang.jpg";

                [PTOCRModule ImageToPDF: doc src: img options: options];

                // D) Check the result

                NSString *outputFile = @"../../TestFiles/Output/multi_lang.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 2: multi_lang.jpg\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image

            @try
            {
                // A) Open the .pdf document

                PTPDFDoc *doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/german_kids_song.pdf"];

                // B) Setup options with a single language and an ignore zone

                PTObjSet *set = [[PTObjSet alloc] init];
                PTObj *options = [set CreateDict];
                PTObj *lang_array = [options PutArray: @"Langs"];
                [lang_array PushBackString: @"deu"];

                // "IgnoreZones" holds one nested array per page; each nested array
                // holds the rectangles to skip on that page.
                PTObj *ignore_zones = [options PutArray: @"IgnoreZones"];
                PTObj *ignore_zones_page1 = [ignore_zones PushBackArray];
                [ignore_zones_page1 PushBackRect: 1768 y1: 680 x2: 2056 y2: 3044];

                // C) Run OCR on the .pdf with options

                [PTOCRModule ProcessPDF: doc options: options];

                // D) Check the result

                NSString *outputFile = @"../../TestFiles/Output/german_kids_song.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 3: german_kids_song.pdf\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 4) Process multipage tiff with text/ignore zones specified for each page, optionally provide English as the target language

            @try
            {
                // A) Setup empty destination doc

                PTPDFDoc *doc = [[PTPDFDoc alloc] init];

                // B) Setup options with a single language plus text/ignore zones

                PTObjSet *set = [[PTObjSet alloc] init];
                PTObj *options = [set CreateDict];
                PTObj *lang_array = [options PutArray: @"Langs"];
                [lang_array PushBackString: @"eng"];

                PTObj *ignore_zones = [options PutArray: @"IgnoreZones"];
                PTObj *ignore_zones_page1 = [ignore_zones PushBackArray];
                PTObj *ignore_zones_page2 = [ignore_zones PushBackArray];
                PTObj *ignore_zones_page3 = [ignore_zones PushBackArray];

                // ignore signature box in the first 2 pages
                [ignore_zones_page1 PushBackRect: 1492 y1: 56 x2: 2236 y2: 432];
                [ignore_zones_page2 PushBackRect: 1492 y1: 56 x2: 2236 y2: 432];

                // can use a combination of ignore and text boxes to focus on the page area of interest,
                // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
                [ignore_zones_page3 PushBackRect: 992 y1: 1276 x2: 1368 y2: 1372];

                PTObj *text_zones = [options PutArray: @"TextZones"];

                // We only have text zones selected in page 3. The arrays for pages 1 and 2
                // stay empty, but they must still be pushed so that the third entry lines
                // up with page 3; their return values are not needed.
                [text_zones PushBackArray];
                [text_zones PushBackArray];
                PTObj *text_zones_page3 = [text_zones PushBackArray];

                // select horizontal BUFFER ZONE sign
                [text_zones_page3 PushBackRect: 900 y1: 2384 x2: 1236 y2: 2480];

                // select right vertical BUFFER ZONE sign
                [text_zones_page3 PushBackRect: 1960 y1: 1976 x2: 2016 y2: 2296];

                // select Lot No.
                [text_zones_page3 PushBackRect: 696 y1: 1028 x2: 1196 y2: 1128];

                // select part of the plan inside the BUFFER ZONE
                [text_zones_page3 PushBackRect: 428 y1: 1484 x2: 1784 y2: 2344];
                [text_zones_page3 PushBackRect: 948 y1: 1288 x2: 1672 y2: 1476];

                // C) Run OCR on the .tif with options

                NSString *img = @"../../TestFiles/bc_environment_protection.tif";

                [PTOCRModule ImageToPDF: doc src: img options: options];

                // D) Check the result

                NSString *outputFile = @"../../TestFiles/Output/bc_environment_protection.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 4: bc_environment_protection.tif\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words
            // not in the dictionary or filtering out special characters), and finally applying the modified OCR
            // JSON to the source PDF document

            @try
            {
                // A) Open the .pdf document

                PTPDFDoc *doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/zero_value_test_no_text.pdf"];

                // B) Run OCR on the .pdf with the default English language

                NSString *json = [PTOCRModule GetOCRJsonFromPDF: doc options: nil];

                // C) Post-processing step (whatever it might be), but we just print json here

                NSLog(@"OCR result JSON: %@", json);

                // D) Apply potentially modified OCR JSON to the PDF

                [PTOCRModule ApplyOCRJsonToPDF: doc json: json];

                // E) Check the result

                NSString *outputFile = @"../../TestFiles/Output/zero_value_test_no_text.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor

            @try
            {
                // A) Setup empty destination doc

                PTPDFDoc *doc = [[PTPDFDoc alloc] init];

                // B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
                // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.

                NSString *img = @"../../TestFiles/physics.tif";

                NSString *xml = [PTOCRModule GetOCRXmlFromImage: doc src: img options: nil];

                // C) Post-processing step (whatever it might be), but we just print XML here

                NSLog(@"OCR result XML: %@", xml);

                // D) Apply potentially modified OCR XML to the PDF

                [PTOCRModule ApplyOCRXmlToPDF: doc xml: xml];

                // E) Check the result

                NSString *outputFile = @"../../TestFiles/Output/physics.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 6: extracting and applying OCR XML from physics.tif\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }
        }
        @catch(NSException *e)
        {
            // Reached only when something outside the per-example handlers raised,
            // i.e. the library initialization above.
            NSLog(@"%@", e.reason);
            ret = 1;
        }

        return ret;
    }
}
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales
Product:

OCR