OCR to search PDFs and Extract Text - Go Sample Code

This sample code shows how to use the Apryse Server OCR module on scanned documents written in multiple languages. It is provided in Python, C++, C# (.NET), Go, Java, Node.js (JavaScript), PHP, Ruby and VB. The OCR module can make PDFs searchable and extract scanned text for further indexing.

To run this sample, you will need:

  1. Get started with Server SDK in your language/framework
  2. Download an OCR Module

Learn more about our Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package main
7
8import (
9 "fmt"
10 . "github.com/pdftron/pdftron-go/v2"
11)
12
// Relative paths to the folder holding the OCR test inputs and to the folder
// that receives the generated output files.
var (
	inputPath  = "../../TestFiles/OCR/"
	outputPath = "../../TestFiles/Output/"
)
16
17// ---------------------------------------------------------------------------------------
18// The following sample illustrates how to use OCR module
19// --------------------------------------------------------------------------------------
20
21func main(){
22
23 // The first step in every application using PDFNet is to initialize the
24 // library and set the path to common PDF resources. The library is usually
25 // initialized only once, but calling Initialize() multiple times is also fine.
26 var licenseKey = "Insert commercial license key here after purchase, or demo key during evaluation"
27 PDFNetInitialize(licenseKey)
28
29 var iris_installed = true // Set to true if the IRIS OCR module is installed and you wish to use it
30
31 // The location of the OCR Module
32 if iris_installed {
33 PDFNetAddResourceSearchPath("../../IRISOCRModuleWindows/Lib")
34 } else {
35 PDFNetAddResourceSearchPath("../../OCRModuleWindows/Lib")
36 }
37
38 var use_iris = OCRModuleIsIRISModuleAvailable()
39
40 if ! OCRModuleIsModuleAvailable(){
41 fmt.Println("Unable to run OCRTest: PDFTron SDK OCR module not available.\n" +
42 "---------------------------------------------------------------\n" +
43 "The OCR module is an optional add-on, available for download\n" +
44 "at https://docs.apryse.com/core/guides/info/modules#ocr-module . If you have already downloaded this\n" +
45 "module, ensure that the SDK is able to find the required files\n" +
46 "using the PDFNetAddResourceSearchPath() function.")
47 return
48 }
49
50 // Example 1) Process image without specifying options, default language - English - is used
51 // --------------------------------------------------------------------------------
52
53 // A) Setup empty destination doc
54 doc := NewPDFDoc()
55
56 // B) Run OCR on the .png with options
57 ocrOpts := NewOCROptions()
58 if use_iris {
59 ocrOpts.SetOCREngine("iris")
60 }
61
62 OCRModuleImageToPDF(doc, inputPath + "psychomachia_excerpt.png", ocrOpts)
63
64 // C) Check the result
65 doc.Save(outputPath + "psychomachia_excerpt.pdf", uint(0))
66 fmt.Println("Example 1: psychomachia_excerpt.png")
67
68 // Example 2) Process document using multiple languages
69 // --------------------------------------------------------------------------------
70
71 // A) Setup empty destination doc
72 doc = NewPDFDoc()
73
74 // B) Setup options with multiple target languages, English will always be considered as secondary language
75 opts := NewOCROptions()
76 if use_iris {
77 opts.SetOCREngine("iris")
78 }
79 opts.AddLang("deu")
80 opts.AddLang("fra")
81 opts.AddLang("eng")
82
83 // C) Run OCR on the .jpg with options
84 OCRModuleImageToPDF(doc, inputPath + "multi_lang.jpg", opts)
85
86 // D) Check the result
87 doc.Save(outputPath + "multi_lang.pdf", uint(0))
88 fmt.Println("Example 2: multi_lang.jpg")
89
90 // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
91 // --------------------------------------------------------------------------------
92
93 // A) Open the .pdf document
94 doc = NewPDFDoc(inputPath + "german_kids_song.pdf")
95
96 // B) Setup options with a single language and an ignore zone
97 opts = NewOCROptions()
98 if use_iris {
99 opts.SetOCREngine("iris")
100 }
101 opts.AddLang("deu")
102
103 ignoreZones := NewRectCollection()
104 ignoreZones.AddRect(NewRect(424.0, 163.0, 493.0, 730.0))
105 opts.AddIgnoreZonesForPage(ignoreZones, 1)
106
107 // C) Run OCR on the .pdf with options
108 OCRModuleProcessPDF(doc, opts)
109
110 // D) check the result
111 doc.Save(outputPath + "german_kids_song.pdf", uint(0))
112 fmt.Println("Example 3: german_kids_song.pdf")
113
114 // Example 4) Process multi-page tiff with text/ignore zones specified for each page,
115 // --------------------------------------------------------------------------------
116
117 // A) Setup empty destination doc
118 doc = NewPDFDoc()
119
120 // B) Setup options with a single language plus text/ignore zones
121 opts = NewOCROptions()
122 if use_iris {
123 opts.SetOCREngine("iris")
124 }
125 opts.AddLang("eng")
126
127 ignoreZones = NewRectCollection()
128
129 // ignore signature box in the first 2 pages
130 ignoreZones.AddRect(NewRect(1492.0, 56.0, 2236.0, 432.0))
131 opts.AddIgnoreZonesForPage(ignoreZones, 1)
132 opts.AddIgnoreZonesForPage(ignoreZones, 2)
133
134 // can use a combination of ignore and text boxes to focus on the page area of interest,
135 // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
136 ignoreZones.Clear()
137 ignoreZones.AddRect(NewRect(992.0, 1276.0, 1368.0, 1372.0))
138 opts.AddIgnoreZonesForPage(ignoreZones, 3)
139
140 textZones := NewRectCollection()
141 // we only have text zones selected in page 3
142 // select horizontal BUFFER ZONE sign
143 textZones.AddRect(NewRect(900.0, 2384.0, 1236.0, 2480.0))
144 // select right vertical BUFFER ZONE sign
145 textZones.AddRect(NewRect(1960.0, 1976.0, 2016.0, 2296.0))
146 // select Lot No.
147 textZones.AddRect(NewRect(696.0, 1028.0, 1196.0, 1128.0))
148
149 // select part of the plan inside the BUFFER ZONE
150 textZones.AddRect(NewRect(428.0, 1484.0, 1784.0, 2344.0))
151 textZones.AddRect(NewRect(948.0, 1288.0, 1672.0, 1476.0))
152 opts.AddTextZonesForPage(textZones, 3)
153
154 // C) Run OCR on the .pdf with options
155 OCRModuleImageToPDF(doc, inputPath + "bc_environment_protection.tif", opts)
156
157 // D) check the result
158 doc.Save(outputPath + "bc_environment_protection.pdf", uint(0))
159 fmt.Println("Example 4: bc_environment_protection.tif")
160
161 // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing
162 // (e.g., removing words not in the dictionary or filtering out special characters),
163 // and finally applying modified OCR JSON to the source PDF document
164 // --------------------------------------------------------------------------------
165
166 // A) Open the .pdf document
167 doc = NewPDFDoc(inputPath + "zero_value_test_no_text.pdf")
168
169 // B) Run OCR on the .pdf with English language
170
171 opts = NewOCROptions()
172 if use_iris {
173 opts.SetOCREngine("iris")
174 }
175 opts.AddLang("eng")
176
177 // C) Run OCR on the .pdf
178 json := OCRModuleGetOCRJsonFromPDF(doc, opts)
179
180 // D) Post-processing step (whatever it might be)
181 fmt.Println("Have OCR result JSON, re-applying to PDF")
182
183 // E) Apply potentially modified OCR JSON to the PDF
184 OCRModuleApplyOCRJsonToPDF(doc, json)
185
186 // F) Check the result
187 doc.Save(outputPath + "zero_value_test_no_text.pdf", uint(0))
188 fmt.Println("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf")
189
190 // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format,
191 // similar to the one used by TextExtractor
192 // --------------------------------------------------------------------------------
193
194 // A) Setup empty destination doc
195 doc = NewPDFDoc()
196
197 // B) Run OCR on the .tif with English language, extracting OCR results in XML format. Note that
198 // in the process we convert the source image into PDF.
199 // We reuse this PDF document later to add hidden text layer to it.
200 opts = NewOCROptions()
201 if use_iris {
202 opts.SetOCREngine("iris")
203 }
204 opts.AddLang("eng")
205 xml := OCRModuleGetOCRXmlFromImage(doc, inputPath + "physics.tif", opts)
206
207 // C) Post-processing step (whatever it might be)
208 fmt.Println("Have OCR result XML, re-applying to PDF")
209
210 // D) Apply potentially modified OCR XML to the PDF
211 OCRModuleApplyOCRXmlToPDF(doc, xml)
212
213 // E) Check the result
214 doc.Save(outputPath + "physics.pdf", uint(0))
215 fmt.Println("Example 6: extracting and applying OCR XML from physics.tif")
216}
217

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales