OCR to search PDFs and Extract Text - C++ Sample Code

This sample code shows how to use the Apryse Server OCR module on scanned documents in multiple languages; it is provided in Python, C++, C# (.NET), Java, Node.js (JavaScript), PHP, Ruby and VB. The OCR module can make searchable PDFs and extract scanned text for further indexing.

To run this sample, you will need to:

  1. Get started with Server SDK in your language/framework
  2. Download an OCR Module

Learn more about our Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5#include <PDF/PDFNet.h>
6#include <PDF/PDFDoc.h>
7#include <PDF/OCRModule.h>
8#include <PDF/OCROptions.h>
9#include <SDF/Obj.h>
10#include <iostream>
11#include "../../LicenseKey/CPP/LicenseKey.h"
12
13using namespace std;
14using namespace pdftron;
15using namespace PDF;
16using namespace SDF;
17
18//---------------------------------------------------------------------------------------
19// The following sample illustrates how to use OCR module
20//---------------------------------------------------------------------------------------
21int main(int argc, char *argv[])
22{
23 try
24 {
25 // The first step in every application using PDFNet is to initialize the
26 // library and set the path to common PDF resources. The library is usually
27 // initialized only once, but calling Initialize() multiple times is also fine.
28 PDFNet::Initialize(LicenseKey);
29 // The location of the OCR Module
30 PDFNet::AddResourceSearchPath("../../../Lib/");
31
32 // if the IRIS OCR module is available, will use that instead of the default
33 const bool use_iris = OCRModule::IsIRISModuleAvailable();
34 if(!OCRModule::IsModuleAvailable())
35 {
36 cout << endl;
37 cout << "Unable to run OCRTest: Apryse SDK OCR module not available." << endl;
38 cout << "---------------------------------------------------------------" << endl;
39 cout << "The OCR module is an optional add-on, available for download" << endl;
40 cout << "at https://docs.apryse.com/core/guides/info/modules#ocr-module . If you have already downloaded this" << endl;
41 cout << "module, ensure that the SDK is able to find the required files" << endl;
42 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
43 return 0;
44 }
45
46 // Relative path to the folder containing test files.
47 string input_path = "../../TestFiles/OCR/";
48 string output_path = "../../TestFiles/Output/";
49
50
51 //--------------------------------------------------------------------------------
52 // Example 1) Process image without specifying options, default language - English - is used
53 try
54 {
55
56 // A) Setup empty destination doc
57
58 PDFDoc doc;
59
60 // B) Run OCR on the .png without options
61
62 OCROptions opts;
63 if(use_iris) opts.SetOCREngine("iris");
64 OCRModule::ImageToPDF(doc, input_path + "psychomachia_excerpt.png", &opts);
65
66 // C) check the result
67
68 doc.Save(output_path + "psychomachia_excerpt.pdf", 0, 0);
69
70 cout << "Example 1: psychomachia_excerpt.png" << endl;
71
72 }
73 catch(Common::Exception& e)
74 {
75 cout << e << endl;
76 }
77 catch(...)
78 {
79 cout << "Unknown Exception" << endl;
80 }
81
82 //--------------------------------------------------------------------------------
83 // Example 2) Process document using multiple languages
84 try
85 {
86 // A) Setup empty destination doc
87
88 PDFDoc doc;
89
90 // B) Setup options with multiple target languages, English will always be considered as secondary language
91
92 OCROptions opts;
93 if(use_iris) opts.SetOCREngine("iris");
94 opts.AddLang("deu");
95 opts.AddLang("fra");
96 opts.AddLang("eng");
97
98 // C) Run OCR on the .jpg with options
99
100 OCRModule::ImageToPDF(doc, input_path + "multi_lang.jpg", &opts);
101
102 // D) check the result
103
104 doc.Save(output_path + "multi_lang.pdf", 0, 0);
105
106 cout << "Example 2: multi_lang.jpg" << endl;
107
108 }
109 catch (Common::Exception& e)
110 {
111 cout << e << endl;
112 }
113 catch (...)
114 {
115 cout << "Unknown Exception" << endl;
116 }
117
118 //--------------------------------------------------------------------------------
119 // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
120 try
121 {
122 // A) Open the .pdf document
123
124 PDFDoc doc((input_path + "german_kids_song.pdf").c_str());
125
126 // B) Setup options with a single language and an ignore zone
127
128 OCROptions opts;
129 if(use_iris) opts.SetOCREngine("iris");
130 opts.AddLang("deu");
131
132 RectCollection ignore_zones;
133 ignore_zones.AddRect(424, 163, 493, 730);
134 opts.AddIgnoreZonesForPage(ignore_zones, 1);
135
136 // C) Run OCR on the .pdf with options
137
138 OCRModule::ProcessPDF(doc, &opts);
139
140 // D) check the result
141
142 PDFDoc doc_out(doc);
143 doc_out.Save(output_path + "german_kids_song.pdf", 0, 0);
144
145 cout << "Example 3: german_kids_song.pdf" << endl;
146 }
147 catch (Common::Exception& e)
148 {
149 cout << e << endl;
150 }
151 catch (...)
152 {
153 cout << "Unknown Exception" << endl;
154 }
155
156 //--------------------------------------------------------------------------------
157 // Example 4) Process multipage tiff with text/ignore zones specified for each page, optionally provide English as the target language
158 try
159 {
160 // A) Setup empty destination doc
161
162 PDFDoc doc;
163
164 // B) Setup options with a single language plus text/ignore zones
165
166 OCROptions opts;
167 if(use_iris) opts.SetOCREngine("iris");
168 opts.AddLang("eng");
169
170 RectCollection ignore_zones;
171 // ignore signature box in the first 2 pages
172 ignore_zones.AddRect(1492, 56, 2236, 432);
173 opts.AddIgnoreZonesForPage(ignore_zones, 1);
174 opts.AddIgnoreZonesForPage(ignore_zones, 2);
175
176 // can use a combination of ignore and text boxes to focus on the page area of interest,
177 // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
178 ignore_zones.Clear();
179 ignore_zones.AddRect(992, 1276, 1368, 1372);
180 opts.AddIgnoreZonesForPage(ignore_zones, 3);
181
182 RectCollection text_zones;
183 // we only have text zones selected in page 3
184
185 // select horizontal BUFFER ZONE sign
186 text_zones.AddRect(900, 2384, 1236, 2480);
187 // select right vertical BUFFER ZONE sign
188 text_zones.AddRect(1960, 1976, 2016, 2296);
189 // select Lot No.
190 text_zones.AddRect(696, 1028, 1196, 1128);
191
192 // select part of the plan inside the BUFFER ZONE
193 text_zones.AddRect(428, 1484, 1784, 2344);
194 text_zones.AddRect(948, 1288, 1672, 1476);
195 opts.AddTextZonesForPage(text_zones, 3);
196
197 // C) Run OCR on the .tif with options
198
199 OCRModule::ImageToPDF(doc, input_path + "bc_environment_protection.tif", &opts);
200
201 // D) check the result
202
203 doc.Save(output_path + "bc_environment_protection.pdf", 0, 0);
204
205 cout << "Example 4: bc_environment_protection.tif" << endl;
206
207 }
208 catch (Common::Exception& e)
209 {
210 cout << e << endl;
211 }
212 catch (...)
213 {
214 cout << "Unknown Exception" << endl;
215 }
216
217 //--------------------------------------------------------------------------------
218 // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the dictionary or filtering special
219 // out special characters), and finally applying modified OCR JSON to the source PDF document
220 try
221 {
222
223 // A) Open the .pdf document
224
225 PDFDoc doc((input_path + "zero_value_test_no_text.pdf").c_str());
226
227 // B) Run OCR on the .pdf with default English language
228 OCROptions opts;
229 if(use_iris) opts.SetOCREngine("iris");
230
231 UString json = OCRModule::GetOCRJsonFromPDF(doc, &opts);
232
233 // C) Post-processing step (whatever it might be)
234
235 cout << "Have OCR result JSON, re-applying to PDF " << endl;
236
237 // D) Apply potentially modified OCR JSON to the PDF
238
239 OCRModule::ApplyOCRJsonToPDF(doc, json);
240
241 // E) Check the result
242
243 PDFDoc doc_out(doc);
244 doc_out.Save(output_path + "zero_value_test_no_text.pdf", 0, 0);
245
246 cout << "Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf" << endl;
247
248 }
249 catch (Common::Exception& e)
250 {
251 cout << e << endl;
252 }
253 catch (...)
254 {
255 cout << "Unknown Exception" << endl;
256 }
257
258 //--------------------------------------------------------------------------------
259 // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
260 try
261 {
262
263 // A) Setup empty destination doc
264
265 PDFDoc doc;
266
267 // B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
268 // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
269
270 OCROptions opts;
271 if(use_iris) opts.SetOCREngine("iris");
272 UString xml = OCRModule::GetOCRXmlFromImage(doc, input_path + "physics.tif", NULL);
273
274 // C) Post-processing step (whatever it might be)
275
276 cout << "Have OCR result XML, re-applying to PDF" << endl;
277
278 // D) Apply potentially modified OCR XML to the PDF
279
280 OCRModule::ApplyOCRXmlToPDF(doc, xml);
281
282 // E) Check the result
283
284 PDFDoc doc_out(doc);
285 doc_out.Save(output_path + "physics.pdf", 0, 0);
286
287 cout << "Example 6: extracting and applying OCR XML from physics.tif" << endl;
288
289 }
290 catch (Common::Exception& e)
291 {
292 cout << e << endl;
293 }
294 catch (...)
295 {
296 cout << "Unknown Exception" << endl;
297 }
298
299 cout << "Done." << endl;
300
301 PDFNet::Terminate();
302 }
303 catch(Common::Exception& e)
304 {
305 cout << e << endl;
306 }
307 catch (...) {
308 cout << "Unknown Exception" << endl;
309 }
310
311 return 0;
312}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales