OCR to search PDFs and Extract Text - Python Sample Code

This sample code shows how to use the Apryse Server OCR module on scanned documents in multiple languages. It is provided in Python, C++, C# (.NET), Java, Node.js (JavaScript), PHP, Ruby and VB. The OCR module can make PDFs searchable and extract scanned text for further indexing.

To run this sample, you will need to:

  1. Get started with Server SDK in your language/framework
  2. Download an OCR Module

Learn more about our Server SDK.

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from apryse_sdk import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
14# Relative paths to the folder containing the OCR test files and to the folder that receives the output.
15input_path = "../../TestFiles/OCR/"
16output_path = "../../TestFiles/Output/"
17
18# ---------------------------------------------------------------------------------------
19# The following sample illustrates how to use OCR module
20# --------------------------------------------------------------------------------------
21
def _new_ocr_options(use_iris, *languages):
    """Build the OCROptions shared by every example.

    Args:
        use_iris: When true, the "iris" OCR engine is selected before any
            language is added.
        *languages: Values passed to AddLang(), in the order given.

    Returns:
        The configured OCROptions instance.
    """
    opts = OCROptions()
    if use_iris:
        opts.SetOCREngine("iris")
    for language in languages:
        opts.AddLang(language)
    return opts


def main():
    """Run the six OCR sample workflows and save their results to output_path.

    PDFNet is initialized on entry and terminated on exit. Termination happens
    in a ``finally`` clause so it also runs when the OCR module is missing or
    when one of the examples raises.
    """
    # The first step in every application using PDFNet is to initialize the
    # library and set the path to common PDF resources. The library is usually
    # initialized only once, but calling Initialize() multiple times is also fine.
    PDFNet.Initialize(LicenseKey)

    try:
        iris_installed = True  # Set to True if the IRIS OCR module is installed and you wish to use it

        # The location of the OCR Module
        if iris_installed:
            PDFNet.AddResourceSearchPath("../../../IRISOCRModuleWindows/Lib/")
        else:
            PDFNet.AddResourceSearchPath("../../../OCRModuleWindows/Lib/")

        use_iris = OCRModule.IsIRISModuleAvailable()

        if not OCRModule.IsModuleAvailable():
            print(
                "\n"
                " Unable to run OCRTest: PDFTron SDK OCR module not available.\n"
                " ---------------------------------------------------------------\n"
                " The OCR module is an optional add-on, available for download\n"
                " at https://dev.apryse.com/. If you have already downloaded this\n"
                " module, ensure that the SDK is able to find the required files\n"
                " using the PDFNet::AddResourceSearchPath() function."
            )
            # Nothing else can run without the module; the finally clause
            # below still terminates PDFNet.
            return

        # Example 1) Process image
        # --------------------------------------------------------------------------------

        # A) Setup empty destination doc
        doc = PDFDoc()

        # B) Set English as the language of choice
        opts = _new_ocr_options(use_iris, "eng")

        # C) Run OCR on the .png with options
        OCRModule.ImageToPDF(doc, input_path + "psychomachia_excerpt.png", opts)

        # D) Check the result
        doc.Save(output_path + "psychomachia_excerpt.pdf", 0)

        print("Example 1: psychomachia_excerpt.png")

        # Example 2) Process document using multiple languages
        # --------------------------------------------------------------------------------

        # A) Setup empty destination doc
        doc = PDFDoc()

        # B) Setup options with multiple target languages, English will always
        #    be considered as secondary language
        opts = _new_ocr_options(use_iris, "deu", "fra", "eng")

        # C) Run OCR on the .jpg with options
        OCRModule.ImageToPDF(doc, input_path + "multi_lang.jpg", opts)

        # D) Check the result
        doc.Save(output_path + "multi_lang.pdf", 0)

        print("Example 2: multi_lang.jpg")

        # Example 3) Process a .pdf specifying a language - German - and an
        #            ignore zone comprising a sidebar image
        # --------------------------------------------------------------------------------

        # A) Open the .pdf document
        doc = PDFDoc(input_path + "german_kids_song.pdf")

        # B) Setup options with a single language and an ignore zone
        opts = _new_ocr_options(use_iris, "deu")

        ignore_zones = RectCollection()
        ignore_zones.AddRect(Rect(424, 163, 493, 730))
        opts.AddIgnoreZonesForPage(ignore_zones, 1)

        # C) Run OCR on the .pdf with options
        OCRModule.ProcessPDF(doc, opts)

        # D) Check the result
        doc.Save(output_path + "german_kids_song.pdf", 0)

        print("Example 3: german_kids_song.pdf")

        # Example 4) Process multi-page tiff with text/ignore zones specified for each page
        # --------------------------------------------------------------------------------

        # A) Setup empty destination doc
        doc = PDFDoc()

        # B) Setup options with a single language plus text/ignore zones
        opts = _new_ocr_options(use_iris, "eng")

        ignore_zones = RectCollection()

        # ignore signature box in the first 2 pages
        ignore_zones.AddRect(Rect(1492, 56, 2236, 432))
        opts.AddIgnoreZonesForPage(ignore_zones, 1)
        opts.AddIgnoreZonesForPage(ignore_zones, 2)

        # can use a combination of ignore and text boxes to focus on the page area of interest,
        # as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
        ignore_zones.Clear()
        ignore_zones.AddRect(Rect(992, 1276, 1368, 1372))
        opts.AddIgnoreZonesForPage(ignore_zones, 3)

        # we only have text zones selected in page 3
        text_zones = RectCollection()

        # select horizontal BUFFER ZONE sign
        text_zones.AddRect(Rect(900, 2384, 1236, 2480))

        # select right vertical BUFFER ZONE sign
        text_zones.AddRect(Rect(1960, 1976, 2016, 2296))

        # select Lot No.
        text_zones.AddRect(Rect(696, 1028, 1196, 1128))

        # select part of the plan inside the BUFFER ZONE
        text_zones.AddRect(Rect(428, 1484, 1784, 2344))
        text_zones.AddRect(Rect(948, 1288, 1672, 1476))
        opts.AddTextZonesForPage(text_zones, 3)

        # C) Run OCR on the .tif with options
        OCRModule.ImageToPDF(doc, input_path + "bc_environment_protection.tif", opts)

        # D) Check the result
        doc.Save(output_path + "bc_environment_protection.pdf", 0)

        print("Example 4: bc_environment_protection.tif")

        # Example 5) Alternative workflow for extracting OCR result JSON, postprocessing
        #            (e.g., removing words not in the dictionary or filtering out
        #            special characters), and finally applying the modified OCR JSON
        #            to the source PDF document
        # --------------------------------------------------------------------------------

        # A) Open the .pdf document
        doc = PDFDoc(input_path + "zero_value_test_no_text.pdf")

        # B) Set English language
        opts = _new_ocr_options(use_iris, "eng")

        # C) Run OCR on the .pdf
        # (named ocr_json so the local does not shadow the stdlib module name)
        ocr_json = OCRModule.GetOCRJsonFromPDF(doc, opts)

        # D) Post-processing step (whatever it might be)
        print("Have OCR result JSON, re-applying to PDF")

        # E) Apply potentially modified OCR JSON to the PDF
        OCRModule.ApplyOCRJsonToPDF(doc, ocr_json)

        # F) Check the result
        doc.Save(output_path + "zero_value_test_no_text.pdf", 0)

        print("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf")

        # Example 6) The postprocessing workflow has also an option of extracting OCR results
        #            in XML format, similar to the one used by TextExtractor
        # --------------------------------------------------------------------------------

        # A) Setup empty destination doc
        doc = PDFDoc()

        # B) Set English language
        opts = _new_ocr_options(use_iris, "eng")

        # C) Run OCR on the .tif with English language, extracting OCR results in XML format.
        #    Note that in the process we convert the source image into PDF.
        #    We reuse this PDF document later to add hidden text layer to it.
        # (named ocr_xml so the local does not shadow the stdlib package name)
        ocr_xml = OCRModule.GetOCRXmlFromImage(doc, input_path + "physics.tif", opts)

        # D) Post-processing step (whatever it might be)
        print("Have OCR result XML, re-applying to PDF")

        # E) Apply potentially modified OCR XML to the PDF
        OCRModule.ApplyOCRXmlToPDF(doc, ocr_xml)

        # F) Check the result
        doc.Save(output_path + "physics.pdf", 0)

        print("Example 6: extracting and applying OCR XML from physics.tif")
    finally:
        # Always release the library, including on the early return above and
        # when an example raises.
        PDFNet.Terminate()
220
# Run the sample only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales