OCR

This sample C# code shows how to use the Apryse OCR module on scanned documents in multiple languages. The OCR module can make searchable PDFs and extract scanned text for further indexing. Learn more about our Server SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------

using System;
using pdftron;
using pdftron.Common;
using pdftron.SDF;
using pdftron.PDF;

namespace OCRTestCS
{
    /// <summary>
    /// Illustrates how to use the OCR module: converting images to searchable PDFs,
    /// running OCR on existing PDFs, restricting recognition with ignore/text zones,
    /// and round-tripping OCR results through JSON or XML.
    /// </summary>
    class Class1
    {
        private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();

        // Explicit (empty) static constructor kept from the original sample.
        // NOTE(review): presumably present so the loader field above is initialized
        // before Main runs - confirm before removing.
        static Class1() {}

        // Relative path to the folder containing test files.
        private const string InputPath = "../../../../TestFiles/OCR/";

        // Relative path to the folder that receives the generated documents.
        private const string OutputPath = "../../../../TestFiles/Output/";

        /// <summary>
        /// The main entry point for the application. Runs six independent OCR examples;
        /// a <see cref="PDFNetException"/> raised by one example is printed and does not
        /// prevent the following examples from running.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // The first step in every application using PDFNet is to initialize the
            // library and set the path to common PDF resources. The library is usually
            // initialized only once, but calling Initialize() multiple times is also fine.
            PDFNet.Initialize(PDFTronLicense.Key);

            // Everything after Initialize() runs inside try/finally so that
            // PDFNet.Terminate() is reached on every exit path, including the early
            // return below and exceptions that are not PDFNetException.
            try
            {
                // Can optionally set path to the OCR module
                PDFNet.AddResourceSearchPath("../../../../../Lib/");

                // if the IRIS OCR module is available, will use that instead of the default
                bool useIris = OCRModule.IsIRISModuleAvailable();
                if (!OCRModule.IsModuleAvailable())
                {
                    Console.WriteLine("");
                    Console.WriteLine("Unable to run OCRTest: Apryse SDK OCR module not available.");
                    Console.WriteLine("---------------------------------------------------------------");
                    Console.WriteLine("The OCR module is an optional add-on, available for download");
                    Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
                    Console.WriteLine("module, ensure that the SDK is able to find the required files");
                    Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
                    Console.WriteLine("");
                    return;
                }

                //--------------------------------------------------------------------------------
                // Example 1) Process image
                try
                {
                    // A) Setup empty destination doc
                    using (PDFDoc doc = new PDFDoc())
                    {
                        // B) Set English as the language of choice
                        OCROptions opts = CreateOptions(useIris, "eng");

                        // C) Run OCR on the .png with options
                        OCRModule.ImageToPDF(doc, InputPath + "psychomachia_excerpt.png", opts);

                        // D) check the result
                        doc.Save(OutputPath + "psychomachia_excerpt.pdf", SDFDoc.SaveOptions.e_remove_unused);

                        Console.WriteLine("Example 1: psychomachia_excerpt.png");
                    }
                }
                catch (PDFNetException e)
                {
                    Console.WriteLine(e.Message);
                }

                //--------------------------------------------------------------------------------
                // Example 2) Process document using multiple languages
                try
                {
                    // A) Setup empty destination doc
                    using (PDFDoc doc = new PDFDoc())
                    {
                        // B) Setup options with multiple target languages, English will always be considered as secondary language
                        OCROptions opts = CreateOptions(useIris, "deu", "fra", "eng");

                        // C) Run OCR on the .jpg with options
                        OCRModule.ImageToPDF(doc, InputPath + "multi_lang.jpg", opts);

                        // D) check the result
                        doc.Save(OutputPath + "multi_lang.pdf", SDFDoc.SaveOptions.e_remove_unused);

                        Console.WriteLine("Example 2: multi_lang.jpg");
                    }
                }
                catch (PDFNetException e)
                {
                    Console.WriteLine(e.Message);
                }

                //--------------------------------------------------------------------------------
                // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
                try
                {
                    // A) Open the .pdf document
                    using (PDFDoc doc = new PDFDoc(InputPath + "german_kids_song.pdf"))
                    {
                        // B) Setup options with a single language and an ignore zone
                        OCROptions opts = CreateOptions(useIris, "deu");

                        RectCollection ignoreZones = new RectCollection();
                        ignoreZones.AddRect(424, 163, 493, 730);
                        opts.AddIgnoreZonesForPage(ignoreZones, 1);

                        // C) Run OCR on the .pdf with options
                        OCRModule.ProcessPDF(doc, opts);

                        // D) check the result
                        doc.Save(OutputPath + "german_kids_song.pdf", SDFDoc.SaveOptions.e_remove_unused);

                        Console.WriteLine("Example 3: german_kids_song.pdf");
                    }
                }
                catch (PDFNetException e)
                {
                    Console.WriteLine(e.Message);
                }

                //--------------------------------------------------------------------------------
                // Example 4) Process multipage tiff with text/ignore zones specified for each page
                try
                {
                    // A) Setup empty destination doc
                    using (PDFDoc doc = new PDFDoc())
                    {
                        // B) Setup options with a single language plus text/ignore zones
                        OCROptions opts = CreateOptions(useIris, "eng");

                        // One collection is reused for every page: it is filled, handed to
                        // the options object, then cleared before the next set of zones.
                        RectCollection zones = new RectCollection();

                        // ignore signature box in the first 2 pages
                        zones.AddRect(1492, 56, 2236, 432);
                        opts.AddIgnoreZonesForPage(zones, 1);
                        zones.Clear();

                        zones.AddRect(1492, 56, 2236, 432);
                        opts.AddIgnoreZonesForPage(zones, 2);
                        zones.Clear();

                        // can use a combination of ignore and text boxes to focus on the page area of interest,
                        // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
                        zones.AddRect(992, 1276, 1368, 1372);
                        opts.AddIgnoreZonesForPage(zones, 3);
                        zones.Clear();

                        // select horizontal BUFFER ZONE sign
                        zones.AddRect(900, 2384, 1236, 2480);
                        // select right vertical BUFFER ZONE sign
                        zones.AddRect(1960, 1976, 2016, 2296);
                        // select Lot No.
                        zones.AddRect(696, 1028, 1196, 1128);

                        // select part of the plan inside the BUFFER ZONE
                        zones.AddRect(428, 1484, 1784, 2344);
                        zones.AddRect(948, 1288, 1672, 1476);
                        opts.AddTextZonesForPage(zones, 3);

                        // C) Run OCR on the multipage .tif with options
                        OCRModule.ImageToPDF(doc, InputPath + "bc_environment_protection.tif", opts);

                        // D) check the result
                        doc.Save(OutputPath + "bc_environment_protection.pdf", SDFDoc.SaveOptions.e_remove_unused);

                        Console.WriteLine("Example 4: bc_environment_protection.tif");
                    }
                }
                catch (PDFNetException e)
                {
                    Console.WriteLine(e.Message);
                }

                //--------------------------------------------------------------------------------
                // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words
                // not in the dictionary or filtering out special characters), and finally applying modified OCR JSON
                // to the source PDF document
                try
                {
                    // A) Open the .pdf document
                    using (PDFDoc doc = new PDFDoc(InputPath + "zero_value_test_no_text.pdf"))
                    {
                        // B) set English language
                        OCROptions opts = CreateOptions(useIris, "eng");

                        // C) Run OCR on the .pdf
                        string json = OCRModule.GetOCRJsonFromPDF(doc, opts);

                        // D) Post-processing step (whatever it might be); this sample leaves the JSON
                        // unmodified and only prints a status message
                        Console.WriteLine("Have OCR result JSON, re-applying to PDF");

                        // E) Apply potentially modified OCR JSON to the PDF
                        OCRModule.ApplyOCRJsonToPDF(doc, json);

                        // F) check the result
                        doc.Save(OutputPath + "zero_value_test_no_text.pdf", SDFDoc.SaveOptions.e_remove_unused);

                        Console.WriteLine("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf");
                    }
                }
                catch (PDFNetException e)
                {
                    Console.WriteLine(e.Message);
                }

                //--------------------------------------------------------------------------------
                // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
                try
                {
                    // A) Setup empty destination doc
                    using (PDFDoc doc = new PDFDoc())
                    {
                        // B) set English language
                        OCROptions opts = CreateOptions(useIris, "eng");

                        // C) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
                        // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
                        string xml = OCRModule.GetOCRXmlFromImage(doc, InputPath + "physics.tif", opts);

                        // D) Post-processing step (whatever it might be); this sample leaves the XML
                        // unmodified and only prints a status message
                        Console.WriteLine("Have OCR result XML, re-applying to PDF");

                        // E) Apply potentially modified OCR XML to the PDF
                        OCRModule.ApplyOCRXmlToPDF(doc, xml);

                        // F) check the result
                        doc.Save(OutputPath + "physics.pdf", SDFDoc.SaveOptions.e_remove_unused);

                        Console.WriteLine("Example 6: extracting and applying OCR XML from physics.tif");
                    }
                }
                catch (PDFNetException e)
                {
                    Console.WriteLine(e.Message);
                }
            }
            finally
            {
                PDFNet.Terminate();
            }
        }

        /// <summary>
        /// Builds the OCR options shared by all examples: selects the "iris" engine when
        /// requested, then registers the given languages in the order supplied.
        /// </summary>
        /// <param name="useIris">When true, <c>SetOCREngine("iris")</c> is called on the options.</param>
        /// <param name="languages">Language codes passed to <c>AddLang</c>, one call per code.</param>
        /// <returns>A new, configured <see cref="OCROptions"/> instance.</returns>
        private static OCROptions CreateOptions(bool useIris, params string[] languages)
        {
            OCROptions opts = new OCROptions();
            if (useIris)
            {
                opts.SetOCREngine("iris");
            }

            foreach (string language in languages)
            {
                opts.AddLang(language);
            }

            return opts;
        }
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales