OCR to search PDFs and Extract Text

This sample code shows how to use the Apryse Server OCR module on scanned documents in multiple languages. It is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, and VB. The OCR module can create searchable PDFs and extract scanned text for further indexing. Learn more about our Server SDK.

<?php
//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
// Consult LICENSE.txt regarding license information.
//---------------------------------------------------------------------------------------

// Load the PHP wrapper only when it is present at the expected relative location.
if (file_exists("../../../PDFNetC/Lib/PDFNetPHP.php")) {
    include("../../../PDFNetC/Lib/PDFNetPHP.php");
}
// Provides $LicenseKey, which is passed to PDFNet::Initialize() below.
include("../../LicenseKey/PHP/LicenseKey.php");

// Paths (relative to the current working directory) of the test input and output folders.
$input_path = getcwd()."/../../TestFiles/OCR/";
$output_path = getcwd()."/../../TestFiles/Output/";

//---------------------------------------------------------------------------------------
// The following sample illustrates how to use the OCR module. It runs six examples:
//   1) OCR a single image into a new PDF
//   2) OCR an image using several target languages
//   3) OCR an existing PDF with a language and an ignore zone
//   4) OCR a multipage TIFF with per-page ignore/text zones
//   5) Extract OCR results as JSON and re-apply them to the PDF
//   6) Extract OCR results as XML and re-apply them to the PDF
//---------------------------------------------------------------------------------------

// The first step in every application using PDFNet is to initialize the
// library and set the path to common PDF resources. The library is usually
// initialized only once, but calling Initialize() multiple times is also fine.
PDFNet::Initialize($LicenseKey);
// Wait for fonts to be loaded if they haven't already. This is done because PHP
// can run into errors when shutting down if font loading is still in progress.
PDFNet::GetSystemFontList();

// The location of the OCR Module
PDFNet::AddResourceSearchPath("../../../Lib/");

// If the IRIS OCR module is available, use it instead of the default engine
$use_iris = OCRModule::IsIRISModuleAvailable();

if (!OCRModule::IsModuleAvailable()) {
    // Each message line is its own literal so the output contains exactly one
    // line break per line and no source-indentation whitespace.
    echo "Unable to run OCRTest: PDFTron SDK OCR module not available.\n"
        ."---------------------------------------------------------------\n"
        ."The OCR module is an optional add-on, available for download\n"
        ."at https://dev.apryse.com/. If you have already downloaded this\n"
        ."module, ensure that the SDK is able to find the required files\n"
        ."using the PDFNet::AddResourceSearchPath() function.\n";
} else {
    //--------------------------------------------------------------------------------
    // Example 1) Process image

    // A) Setup empty destination doc

    $doc = new PDFDoc();

    // B) Use the IRIS OCR engine if available

    $opts = new OCROptions();
    if ($use_iris) {
        $opts->SetOCREngine("iris");
    }

    // C) Run OCR on the .png with options

    OCRModule::ImageToPDF($doc, $input_path."psychomachia_excerpt.png", $opts);

    // D) Save the result so it can be checked

    $doc->Save($output_path."psychomachia_excerpt.pdf", 0);

    echo "Example 1: psychomachia_excerpt.png \n";


    //--------------------------------------------------------------------------------
    // Example 2) Process document using multiple languages

    // A) Setup empty destination doc

    $doc = new PDFDoc();

    // B) Setup options with multiple target languages, English will always be considered as secondary language

    $opts = new OCROptions();
    if ($use_iris) {
        $opts->SetOCREngine("iris");
    }
    $opts->AddLang("deu");
    $opts->AddLang("fra");
    $opts->AddLang("eng");

    // C) Run OCR on the .jpg with options

    OCRModule::ImageToPDF($doc, $input_path."multi_lang.jpg", $opts);

    // D) Save the result so it can be checked

    $doc->Save($output_path."multi_lang.pdf", 0);

    echo "Example 2: multi_lang.jpg \n";


    //--------------------------------------------------------------------------------
    // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image

    // A) Open the .pdf document

    $doc = new PDFDoc($input_path."german_kids_song.pdf");

    // B) Setup options with a single language and an ignore zone

    $opts = new OCROptions();
    if ($use_iris) {
        $opts->SetOCREngine("iris");
    }
    $opts->AddLang("deu");

    $ignore_zones = new RectCollection();
    $rect = new Rect(424.0, 163.0, 493.0, 730.0);
    $ignore_zones->AddRect($rect);
    $opts->AddIgnoreZonesForPage($ignore_zones, 1);

    // C) Run OCR on the .pdf with options

    OCRModule::ProcessPDF($doc, $opts);

    // D) Save the result so it can be checked

    $doc->Save($output_path."german_kids_song.pdf", 0);

    echo "Example 3: german_kids_song.pdf \n";


    //--------------------------------------------------------------------------------
    // Example 4) Process multipage tiff with text/ignore zones specified for each page,
    // optionally provide English as the target language

    // A) Setup empty destination doc

    $doc = new PDFDoc();

    // B) Setup options with a single language plus text/ignore zones

    $opts = new OCROptions();
    if ($use_iris) {
        $opts->SetOCREngine("iris");
    }
    $opts->AddLang("eng");

    $ignore_zones = new RectCollection();
    // ignore signature box in the first 2 pages
    $ignore_zones->AddRect(new Rect(1492.0, 56.0, 2236.0, 432.0));
    $opts->AddIgnoreZonesForPage($ignore_zones, 1);
    $opts->AddIgnoreZonesForPage($ignore_zones, 2);

    // can use a combination of ignore and text boxes to focus on the page area of interest,
    // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
    // (the same collection object is emptied and reused for page 3)
    $ignore_zones->Clear();
    $ignore_zones->AddRect(new Rect(992.0, 1276.0, 1368.0, 1372.0));
    $opts->AddIgnoreZonesForPage($ignore_zones, 3);

    $text_zones = new RectCollection();
    // we only have text zones selected in page 3

    // select horizontal BUFFER ZONE sign
    $text_zones->AddRect(new Rect(900.0, 2384.0, 1236.0, 2480.0));
    // select right vertical BUFFER ZONE sign
    $text_zones->AddRect(new Rect(1960.0, 1976.0, 2016.0, 2296.0));
    // select Lot No.
    $text_zones->AddRect(new Rect(696.0, 1028.0, 1196.0, 1128.0));

    // select part of the plan inside the BUFFER ZONE
    $text_zones->AddRect(new Rect(428.0, 1484.0, 1784.0, 2344.0));
    $text_zones->AddRect(new Rect(948.0, 1288.0, 1672.0, 1476.0));
    $opts->AddTextZonesForPage($text_zones, 3);

    // C) Run OCR on the .tif with options

    OCRModule::ImageToPDF($doc, $input_path."bc_environment_protection.tif", $opts);

    // D) Save the result so it can be checked

    $doc->Save($output_path."bc_environment_protection.pdf", 0);

    echo "Example 4: bc_environment_protection.tif \n";


    //--------------------------------------------------------------------------------
    // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing
    // (e.g., removing words not in the dictionary or filtering out special characters),
    // and finally applying modified OCR JSON to the source PDF document

    // A) Open the source .pdf document

    $doc = new PDFDoc($input_path."zero_value_test_no_text.pdf");

    // B) Use the IRIS OCR engine if available

    $opts = new OCROptions();
    if ($use_iris) {
        $opts->SetOCREngine("iris");
    }

    // C) Run OCR on the .pdf with default English language

    $json = OCRModule::GetOCRJsonFromPDF($doc, $opts);

    // D) Post-processing step (whatever it might be)

    echo "Have OCR result JSON, re-applying to PDF \n";

    OCRModule::ApplyOCRJsonToPDF($doc, $json);

    // E) Save the result so it can be checked

    $doc->Save($output_path."zero_value_test_no_text.pdf", 0);

    echo "Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf \n";


    //--------------------------------------------------------------------------------
    // Example 6) The postprocessing workflow has also an option of extracting OCR results
    // in XML format, similar to the one used by TextExtractor

    // A) Setup empty destination doc

    $doc = new PDFDoc();

    // B) Use the IRIS OCR engine if available

    $opts = new OCROptions();
    if ($use_iris) {
        $opts->SetOCREngine("iris");
    }

    // C) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
    // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.

    $xml = OCRModule::GetOCRXmlFromImage($doc, $input_path."physics.tif", $opts);

    // D) Post-processing step (whatever it might be)

    echo "Have OCR result XML, re-applying to PDF \n";

    OCRModule::ApplyOCRXmlToPDF($doc, $xml);

    // E) Save the result so it can be checked

    $doc->Save($output_path."physics.pdf", 0);

    echo "Example 6: extracting and applying OCR XML from physics.tif \n";

    echo "Done. \n";
}

// Shut the library down whether or not the OCR module was available.
PDFNet::Terminate();

?>

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales