OCR to search PDFs and Extract Text - Go Sample Code

Requirements
View Demo

Sample code shows how to use the Apryse Server OCR module on scanned documents in multiple languages; provided in Python, C++, C# (.NET), Java, Node.js (JavaScript), PHP, Ruby, VB and Go. The OCR module can make searchable PDFs and extract scanned text for further indexing.

Looking for OCR + WebViewer? Check out our OCR - Showcase Sample Code

Learn more about our Server SDK and OCR capabilities.

Implementation steps

To run this sample, complete the following steps:

  1. Get started with Server SDK in your language/framework
  2. Download OCR Module
  3. Add the sample code provided below

To use this feature in production, your license key will need the OCR Package. Trial keys already include this package.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package main
7
8import (
9 "fmt"
10 . "github.com/pdftron/pdftron-go/v2"
11)
12
// Relative paths, resolved against the working directory the sample is run
// from. They are constants because nothing in the sample reassigns them.
const (
	// inputPath is the folder containing the OCR test input files.
	inputPath = "../../TestFiles/OCR/"
	// outputPath is the folder the resulting PDFs are saved to.
	outputPath = "../../TestFiles/Output/"
)
16
17// ---------------------------------------------------------------------------------------
18// The following sample illustrates how to use OCR module
19// --------------------------------------------------------------------------------------
20
21func main(){
22
23 // The first step in every application using PDFNet is to initialize the
24 // library and set the path to common PDF resources. The library is usually
25 // initialized only once, but calling Initialize() multiple times is also fine.
26 var licenseKey = "YOUR_LICENSE_KEY"
27 PDFNetInitialize(licenseKey)
28
29 var iris_installed = true // Set to true if the IRIS OCR module is installed and you wish to use it
30
31 // The location of the OCR Module
32 if iris_installed {
33 PDFNetAddResourceSearchPath("../../IRISOCRModuleWindows/Lib")
34 } else {
35 PDFNetAddResourceSearchPath("../../OCRModuleWindows/Lib")
36 }
37
38 var use_iris = OCRModuleIsIRISModuleAvailable()
39
40 if ! OCRModuleIsModuleAvailable(){
41 fmt.Println("Unable to run OCRTest: PDFTron SDK OCR module not available.\n" +
42 "---------------------------------------------------------------\n" +
43 "The OCR module is an optional add-on, available for download\n" +
44 "at https://docs.apryse.com/core/guides/info/modules#ocr-module . If you have already downloaded this\n" +
45 "module, ensure that the SDK is able to find the required files\n" +
46 "using the PDFNetAddResourceSearchPath() function.")
47 return
48 }
49
50 // Example 1) Process image without specifying options, default language - English - is used
51 // --------------------------------------------------------------------------------
52
53 // A) Setup empty destination doc
54 doc := NewPDFDoc()
55
56 // B) Run OCR on the .png with options
57 ocrOpts := NewOCROptions()
58 if use_iris {
59 ocrOpts.SetOCREngine("iris")
60 }
61
62 OCRModuleImageToPDF(doc, inputPath + "psychomachia_excerpt.png", ocrOpts)
63
64 // C) Check the result
65 doc.Save(outputPath + "psychomachia_excerpt.pdf", uint(0))
66 fmt.Println("Example 1: psychomachia_excerpt.png")
67
68 // Example 2) Process document using multiple languages
69 // --------------------------------------------------------------------------------
70
71 // A) Setup empty destination doc
72 doc = NewPDFDoc()
73
74 // B) Setup options with multiple target languages, English will always be considered as secondary language
75 opts := NewOCROptions()
76 if use_iris {
77 opts.SetOCREngine("iris")
78 }
79 opts.AddLang("deu")
80 opts.AddLang("fra")
81 opts.AddLang("eng")
82
83 // C) Run OCR on the .jpg with options
84 OCRModuleImageToPDF(doc, inputPath + "multi_lang.jpg", opts)
85
86 // D) Check the result
87 doc.Save(outputPath + "multi_lang.pdf", uint(0))
88 fmt.Println("Example 2: multi_lang.jpg")
89
90 // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
91 // --------------------------------------------------------------------------------
92
93 // A) Open the .pdf document
94 doc = NewPDFDoc(inputPath + "german_kids_song.pdf")
95
96 // B) Setup options with a single language and an ignore zone
97 opts = NewOCROptions()
98 if use_iris {
99 opts.SetOCREngine("iris")
100 }
101 opts.AddLang("deu")
102
103 ignoreZones := NewRectCollection()
104 ignoreZones.AddRect(NewRect(424.0, 163.0, 493.0, 730.0))
105 opts.AddIgnoreZonesForPage(ignoreZones, 1)
106
107 // C) Run OCR on the .pdf with options
108 OCRModuleProcessPDF(doc, opts)
109
110 // D) check the result
111 doc.Save(outputPath + "german_kids_song.pdf", uint(0))
112 fmt.Println("Example 3: german_kids_song.pdf")
113
114 // Example 4) Process multi-page tiff with text/ignore zones specified for each page,
115 // --------------------------------------------------------------------------------
116
117 // A) Setup empty destination doc
118 doc = NewPDFDoc()
119
120 // B) Setup options with a single language plus text/ignore zones
121 opts = NewOCROptions()
122 if use_iris {
123 opts.SetOCREngine("iris")
124 }
125 opts.AddLang("eng")
126
127 ignoreZones = NewRectCollection()
128
129 // ignore signature box in the first 2 pages
130 ignoreZones.AddRect(NewRect(1492.0, 56.0, 2236.0, 432.0))
131 opts.AddIgnoreZonesForPage(ignoreZones, 1)
132 opts.AddIgnoreZonesForPage(ignoreZones, 2)
133
134 // can use a combination of ignore and text boxes to focus on the page area of interest,
135 // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
136 ignoreZones.Clear()
137 ignoreZones.AddRect(NewRect(992.0, 1276.0, 1368.0, 1372.0))
138 opts.AddIgnoreZonesForPage(ignoreZones, 3)
139
140 textZones := NewRectCollection()
141 // we only have text zones selected in page 3
142 // select horizontal BUFFER ZONE sign
143 textZones.AddRect(NewRect(900.0, 2384.0, 1236.0, 2480.0))
144 // select right vertical BUFFER ZONE sign
145 textZones.AddRect(NewRect(1960.0, 1976.0, 2016.0, 2296.0))
146 // select Lot No.
147 textZones.AddRect(NewRect(696.0, 1028.0, 1196.0, 1128.0))
148
149 // select part of the plan inside the BUFFER ZONE
150 textZones.AddRect(NewRect(428.0, 1484.0, 1784.0, 2344.0))
151 textZones.AddRect(NewRect(948.0, 1288.0, 1672.0, 1476.0))
152 opts.AddTextZonesForPage(textZones, 3)
153
154 // C) Run OCR on the .pdf with options
155 OCRModuleImageToPDF(doc, inputPath + "bc_environment_protection.tif", opts)
156
157 // D) check the result
158 doc.Save(outputPath + "bc_environment_protection.pdf", uint(0))
159 fmt.Println("Example 4: bc_environment_protection.tif")
160
161 // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing
162 // (e.g., removing words not in the dictionary or filtering out special characters),
163 // and finally applying modified OCR JSON to the source PDF document
164 // --------------------------------------------------------------------------------
165
166 // A) Open the .pdf document
167 doc = NewPDFDoc(inputPath + "zero_value_test_no_text.pdf")
168
169 // B) Run OCR on the .pdf with English language
170
171 opts = NewOCROptions()
172 if use_iris {
173 opts.SetOCREngine("iris")
174 }
175 opts.AddLang("eng")
176
177 // C) Run OCR on the .pdf
178 json := OCRModuleGetOCRJsonFromPDF(doc, opts)
179
180 // D) Post-processing step (whatever it might be)
181 fmt.Println("Have OCR result JSON, re-applying to PDF")
182
183 // E) Apply potentially modified OCR JSON to the PDF
184 OCRModuleApplyOCRJsonToPDF(doc, json)
185
186 // F) Check the result
187 doc.Save(outputPath + "zero_value_test_no_text.pdf", uint(0))
188 fmt.Println("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf")
189
190 // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format,
191 // similar to the one used by TextExtractor
192 // --------------------------------------------------------------------------------
193
194 // A) Setup empty destination doc
195 doc = NewPDFDoc()
196
197 // B) Run OCR on the .tif with English language, extracting OCR results in XML format. Note that
198 // in the process we convert the source image into PDF.
199 // We reuse this PDF document later to add hidden text layer to it.
200 opts = NewOCROptions()
201 if use_iris {
202 opts.SetOCREngine("iris")
203 }
204 opts.AddLang("eng")
205 xml := OCRModuleGetOCRXmlFromImage(doc, inputPath + "physics.tif", opts)
206
207 // C) Post-processing step (whatever it might be)
208 fmt.Println("Have OCR result XML, re-applying to PDF")
209
210 // D) Apply potentially modified OCR XML to the PDF
211 OCRModuleApplyOCRXmlToPDF(doc, xml)
212
213 // E) Check the result
214 doc.Save(outputPath + "physics.pdf", uint(0))
215 fmt.Println("Example 6: extracting and applying OCR XML from physics.tif")
216}
217

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales