OCR

This sample code shows how to use the Apryse Server OCR module on scanned documents written in multiple languages. The sample is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB. The OCR module can make searchable PDFs and extract scanned text for further indexing. Learn more about our Server SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
5
6using System;
7using pdftron;
8using pdftron.Common;
9using pdftron.SDF;
10using pdftron.PDF;
11
namespace OCRTestCS
{

    /// <summary>
    /// Sample illustrating how to use the OCR module: converting images to
    /// searchable PDFs, OCR-ing existing PDFs, restricting recognition with
    /// ignore/text zones, and round-tripping OCR results through JSON or XML.
    /// </summary>
    class Class1
    {
        private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();

        // Explicit static constructor kept from the original sample.
        // NOTE(review): presumably present so the loader field above is
        // initialized at a well-defined point before Main runs - confirm.
        static Class1() {}

        // Relative path to the folder containing test files.
        private const string InputPath = "../../../../TestFiles/OCR/";

        // Relative path to the folder that receives the generated PDFs.
        private const string OutputPath = "../../../../TestFiles/Output/";

        /// <summary>
        /// The main entry point for the application. Initializes PDFNet, checks
        /// that the OCR module is available, runs the six OCR examples in order
        /// and finally terminates PDFNet.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // The first step in every application using PDFNet is to initialize the
            // library and set the path to common PDF resources. The library is usually
            // initialized only once, but calling Initialize() multiple times is also fine.
            PDFNet.Initialize(PDFTronLicense.Key);

            try
            {
                // Can optionally set path to the OCR module
                PDFNet.AddResourceSearchPath("../../../../../Lib/");

                // if the IRIS OCR module is available, will use that instead of the default
                bool useIris = OCRModule.IsIRISModuleAvailable();
                if (!OCRModule.IsModuleAvailable())
                {
                    Console.WriteLine("");
                    Console.WriteLine("Unable to run OCRTest: Apryse SDK OCR module not available.");
                    Console.WriteLine("---------------------------------------------------------------");
                    Console.WriteLine("The OCR module is an optional add-on, available for download");
                    Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
                    Console.WriteLine("module, ensure that the SDK is able to find the required files");
                    Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
                    Console.WriteLine("");
                    return;
                }

                ProcessImage(useIris);
                ProcessMultiLanguageImage(useIris);
                ProcessPdfWithIgnoreZone(useIris);
                ProcessMultipageTiffWithZones(useIris);
                ExtractAndApplyOcrJson(useIris);
                ExtractAndApplyOcrXml(useIris);
            }
            finally
            {
                // Always pair Initialize() with Terminate(), including on the
                // "module not available" early return and on unexpected exceptions.
                PDFNet.Terminate();
            }
        }

        /// <summary>
        /// Builds OCR options: selects the "iris" engine when requested, then
        /// adds the given languages in the order supplied.
        /// </summary>
        /// <param name="useIris">True to select the IRIS OCR engine.</param>
        /// <param name="languages">Language codes passed to AddLang, in order.</param>
        /// <returns>The configured options object.</returns>
        private static OCROptions CreateOptions(bool useIris, params string[] languages)
        {
            OCROptions opts = new OCROptions();
            if (useIris)
            {
                opts.SetOCREngine("iris");
            }

            foreach (string language in languages)
            {
                opts.AddLang(language);
            }

            return opts;
        }

        /// <summary>
        /// Example 1) Process image.
        /// </summary>
        /// <param name="useIris">True to select the IRIS OCR engine.</param>
        private static void ProcessImage(bool useIris)
        {
            try
            {
                // A) Setup empty destination doc
                using (PDFDoc doc = new PDFDoc())
                {
                    // B) Set English as the language of choice
                    OCROptions opts = CreateOptions(useIris, "eng");

                    // C) Run OCR on the .png with options
                    OCRModule.ImageToPDF(doc, InputPath + "psychomachia_excerpt.png", opts);

                    // D) check the result
                    doc.Save(OutputPath + "psychomachia_excerpt.pdf", SDFDoc.SaveOptions.e_remove_unused);

                    Console.WriteLine("Example 1: psychomachia_excerpt.png");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// Example 2) Process document using multiple languages.
        /// </summary>
        /// <param name="useIris">True to select the IRIS OCR engine.</param>
        private static void ProcessMultiLanguageImage(bool useIris)
        {
            try
            {
                // A) Setup empty destination doc
                using (PDFDoc doc = new PDFDoc())
                {
                    // B) Setup options with multiple target languages, English will always be considered as secondary language
                    OCROptions opts = CreateOptions(useIris, "deu", "fra", "eng");

                    // C) Run OCR on the .jpg with options
                    OCRModule.ImageToPDF(doc, InputPath + "multi_lang.jpg", opts);

                    // D) check the result
                    doc.Save(OutputPath + "multi_lang.pdf", SDFDoc.SaveOptions.e_remove_unused);

                    Console.WriteLine("Example 2: multi_lang.jpg");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// Example 3) Process a .pdf specifying a language - German - and an
        /// ignore zone comprising a sidebar image.
        /// </summary>
        /// <param name="useIris">True to select the IRIS OCR engine.</param>
        private static void ProcessPdfWithIgnoreZone(bool useIris)
        {
            try
            {
                // A) Open the .pdf document
                using (PDFDoc doc = new PDFDoc(InputPath + "german_kids_song.pdf"))
                {
                    // B) Setup options with a single language and an ignore zone
                    OCROptions opts = CreateOptions(useIris, "deu");

                    RectCollection ignoreZones = new RectCollection();
                    ignoreZones.AddRect(424, 163, 493, 730);
                    opts.AddIgnoreZonesForPage(ignoreZones, 1);

                    // C) Run OCR on the .pdf with options
                    OCRModule.ProcessPDF(doc, opts);

                    // D) check the result
                    doc.Save(OutputPath + "german_kids_song.pdf", SDFDoc.SaveOptions.e_remove_unused);

                    Console.WriteLine("Example 3: german_kids_song.pdf");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// Example 4) Process multipage tiff with text/ignore zones specified for each page.
        /// </summary>
        /// <param name="useIris">True to select the IRIS OCR engine.</param>
        private static void ProcessMultipageTiffWithZones(bool useIris)
        {
            try
            {
                // A) Setup empty destination doc
                using (PDFDoc doc = new PDFDoc())
                {
                    // B) Setup options with a single language plus text/ignore zones
                    OCROptions opts = CreateOptions(useIris, "eng");

                    // One collection is reused for every page: fill it, hand it to
                    // the options, then clear it before describing the next set.
                    RectCollection zones = new RectCollection();

                    // ignore signature box in the first 2 pages
                    zones.AddRect(1492, 56, 2236, 432);
                    opts.AddIgnoreZonesForPage(zones, 1);
                    zones.Clear();

                    zones.AddRect(1492, 56, 2236, 432);
                    opts.AddIgnoreZonesForPage(zones, 2);
                    zones.Clear();

                    // can use a combination of ignore and text boxes to focus on the page area of interest,
                    // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
                    zones.AddRect(992, 1276, 1368, 1372);
                    opts.AddIgnoreZonesForPage(zones, 3);
                    zones.Clear();

                    // select horizontal BUFFER ZONE sign
                    zones.AddRect(900, 2384, 1236, 2480);
                    // select right vertical BUFFER ZONE sign
                    zones.AddRect(1960, 1976, 2016, 2296);
                    // select Lot No.
                    zones.AddRect(696, 1028, 1196, 1128);

                    // select part of the plan inside the BUFFER ZONE
                    zones.AddRect(428, 1484, 1784, 2344);
                    zones.AddRect(948, 1288, 1672, 1476);
                    opts.AddTextZonesForPage(zones, 3);

                    // C) Run OCR on the .tif with options
                    OCRModule.ImageToPDF(doc, InputPath + "bc_environment_protection.tif", opts);

                    // D) check the result
                    doc.Save(OutputPath + "bc_environment_protection.pdf", SDFDoc.SaveOptions.e_remove_unused);

                    Console.WriteLine("Example 4: bc_environment_protection.tif");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// Example 5) Alternative workflow for extracting OCR result JSON,
        /// postprocessing (e.g., removing words not in the dictionary or filtering
        /// out special characters), and finally applying modified OCR JSON to the
        /// source PDF document.
        /// </summary>
        /// <param name="useIris">True to select the IRIS OCR engine.</param>
        private static void ExtractAndApplyOcrJson(bool useIris)
        {
            try
            {
                // A) Open the .pdf document
                using (PDFDoc doc = new PDFDoc(InputPath + "zero_value_test_no_text.pdf"))
                {
                    // B) set English language
                    OCROptions opts = CreateOptions(useIris, "eng");

                    // C) Run OCR on the .pdf
                    string json = OCRModule.GetOCRJsonFromPDF(doc, opts);

                    // D) Post-processing step (whatever it might be); this sample does not
                    // modify the JSON and only reports that it was obtained
                    Console.WriteLine("Have OCR result JSON, re-applying to PDF");

                    // E) Apply potentially modified OCR JSON to the PDF
                    OCRModule.ApplyOCRJsonToPDF(doc, json);

                    // F) check the result
                    doc.Save(OutputPath + "zero_value_test_no_text.pdf", SDFDoc.SaveOptions.e_remove_unused);

                    Console.WriteLine("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// Example 6) The postprocessing workflow has also an option of extracting
        /// OCR results in XML format, similar to the one used by TextExtractor.
        /// </summary>
        /// <param name="useIris">True to select the IRIS OCR engine.</param>
        private static void ExtractAndApplyOcrXml(bool useIris)
        {
            try
            {
                // A) Setup empty destination doc
                using (PDFDoc doc = new PDFDoc())
                {
                    // B) set English language
                    OCROptions opts = CreateOptions(useIris, "eng");

                    // C) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
                    // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
                    string xml = OCRModule.GetOCRXmlFromImage(doc, InputPath + "physics.tif", opts);

                    // D) Post-processing step (whatever it might be); this sample does not
                    // modify the XML and only reports that it was obtained
                    Console.WriteLine("Have OCR result XML, re-applying to PDF");

                    // E) Apply potentially modified OCR XML to the PDF
                    OCRModule.ApplyOCRXmlToPDF(doc, xml);

                    // F) check the result
                    doc.Save(OutputPath + "physics.pdf", SDFDoc.SaveOptions.e_remove_unused);

                    Console.WriteLine("Example 6: extracting and applying OCR XML from physics.tif");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }

    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales