OCR to Search PDFs and Extract Text - Java Sample Code

Requirements
View Demo

This sample code shows how to use the Apryse Server OCR module on scanned documents in multiple languages. It is provided in Python, C++, C# (.NET), Java, Node.js (JavaScript), PHP, Ruby and VB. The OCR module can make PDFs searchable and extract scanned text for further indexing.

Looking for OCR + WebViewer? Check out our OCR - Showcase Sample Code

Learn more about our Server SDK and OCR capabilities.

Implementation steps

To run this sample, complete the following steps:

  1. Get started with Server SDK in your language/framework
  2. Download OCR Module
  3. Add the sample code provided below

To use this feature in production, your license key will need the OCR Package. Trial keys already include this package.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import com.pdftron.sdf.Obj;
7import com.pdftron.sdf.ObjSet;
8import com.pdftron.sdf.SDFDoc;
9import com.pdftron.pdf.*;
10
11import com.pdftron.common.PDFNetException;
12
13//---------------------------------------------------------------------------------------
14// The following sample illustrates how to use OCR module
15//---------------------------------------------------------------------------------------
16public class OCRTest {
17 public static void main(String[] args) {
18 try {
19 // The first step in every application using PDFNet is to initialize the
20 // library and set the path to common PDF resources. The library is usually
21 // initialized only once, but calling Initialize() multiple times is also fine.
22 PDFNet.initialize(PDFTronLicense.Key());
23 PDFNet.addResourceSearchPath("../../../Lib/");
24
25 boolean use_iris = OCRModule.isIRISModuleAvailable();
26 if( !OCRModule.isModuleAvailable() )
27 {
28 System.out.println("");
29 System.out.println("Unable to run OCRTest: Apryse SDK OCR module not available.");
30 System.out.println("---------------------------------------------------------------");
31 System.out.println("The OCR module is an optional add-on, available for download");
32 System.out.println("at https://docs.apryse.com/core/guides/info/modules#ocr-module . If you have already downloaded this");
33 System.out.println("module, ensure that the SDK is able to find the required files");
34 System.out.println("using the PDFNet.addResourceSearchPath() function.");
35 System.out.println("");
36 return;
37 }
38
39 // Relative path to the folder containing test files.
40 String input_path = "../../TestFiles/OCR/";
41 String output_path = "../../TestFiles/Output/";
42
43 //--------------------------------------------------------------------------------
44 // Example 1) Process image without specifying options, default language - English - is used
45 try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
46 {
47 OCROptions options = new OCROptions();
48 if(use_iris) options.setOCREngine("iris");
49
50 // B) Run OCR on the .png with options
51 OCRModule.imageToPDF(doc, input_path + "psychomachia_excerpt.png", options);
52
53 // C) check the result
54 doc.save(output_path + "psychomachia_excerpt.pdf", SDFDoc.SaveMode.LINEARIZED, null);
55 System.out.println("Example 1: psychomachia_excerpt.png");
56
57 } catch (Exception e) {
58 e.printStackTrace();
59 }
60
61 //--------------------------------------------------------------------------------
62 // Example 2) Process document using multiple languages
63 try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
64 {
65 // B) Setup options with multiple target languages, English will always be considered as secondary language
66 OCROptions options = new OCROptions();
67 if(use_iris) options.setOCREngine("iris");
68 options.addLang("deu");
69 options.addLang("fra");
70 options.addLang("eng");
71
72 // C) Run OCR on the .jpg with options
73 OCRModule.imageToPDF(doc, input_path + "multi_lang.jpg", options);
74
75 // D) check the result
76 doc.save(output_path + "multi_lang.pdf", SDFDoc.SaveMode.LINEARIZED, null);
77 System.out.println("Example 2: multi_lang.jpg");
78 } catch (Exception e) {
79 e.printStackTrace();
80 }
81
82 //--------------------------------------------------------------------------------
83 // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
84 try (PDFDoc doc = new PDFDoc(input_path + "german_kids_song.pdf")) // A) Open the .pdf document
85 {
86 // B) Setup options with a single language and an ignore zone
87 OCROptions options = new OCROptions();
88 if(use_iris) options.setOCREngine("iris");
89 options.addLang("deu");
90
91 RectCollection zones = new RectCollection();
92 zones.addRect(424, 163, 493, 730);
93
94 options.addIgnoreZonesForPage(zones, 1);
95
96 // C) Run OCR on the .pdf with options
97 OCRModule.processPDF(doc, options);
98
99 // D) check the result
100 doc.save(output_path + "german_kids_song.pdf", SDFDoc.SaveMode.LINEARIZED, null);
101 System.out.println("Example 3: german_kids_song.pdf");
102 } catch (Exception e) {
103 e.printStackTrace();
104 }
105
106 //--------------------------------------------------------------------------------
107 // Example 4) Process multipage tiff with text/ignore zones specified for each page, optionally provide English as the target language
108
109 try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
110 {
111 // B) Setup options with a single language plus text/ignore zones
112 OCROptions options = new OCROptions();
113 if(use_iris) options.setOCREngine("iris");
114 options.addLang("eng");
115
116 RectCollection zones = new RectCollection();
117 zones.addRect(1492, 56, 2236, 432);
118
119 // ignore signature box in the first 2 pages
120 options.addIgnoreZonesForPage(zones, 1);
121 options.addIgnoreZonesForPage(zones, 2);
122
123 // can use a combination of ignore and text boxes to focus on the page area of interest,
124 // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
125 zones.clear();
126 zones.addRect(992, 1276, 1368, 1372);
127 options.addIgnoreZonesForPage(zones, 3);
128
129 // we only have text zones selected in page 3
130
131 zones.clear();
132 // select horizontal BUFFER ZONE sign
133 zones.addRect(900, 2384, 1236, 2480);
134 // select right vertical BUFFER ZONE sign
135 zones.addRect(1960, 1976, 2016, 2296);
136 // select Lot No.
137 zones.addRect(696, 1028, 1196, 1128);
138
139 // select part of the plan inside the BUFFER ZONE
140 zones.addRect(428, 1484, 1784, 2344);
141 zones.addRect(948, 1288, 1672, 1476);
142
143 options.addTextZonesForPage(zones, 3);
144
145 // C) Run OCR on the .tif with options
146 OCRModule.imageToPDF(doc, input_path + "bc_environment_protection.tif", options);
147
148 // D) check the result
149 doc.save(output_path + "bc_environment_protection.pdf", SDFDoc.SaveMode.LINEARIZED, null);
150 System.out.println("Example 4: bc_environment_protection.tif");
151 } catch (Exception e) {
152 e.printStackTrace();
153 }
154
155 //--------------------------------------------------------------------------------
156 // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the dictionary or filtering special
157 // out special characters), and finally applying modified OCR JSON to the source PDF document
158 try (PDFDoc doc = new PDFDoc(input_path + "zero_value_test_no_text.pdf")) // A) Open the .pdf document
159 {
160 OCROptions options = new OCROptions();
161 if(use_iris) options.setOCREngine("iris");
162
163 // B) Run OCR on the .pdf with default English language
164 String json = OCRModule.getOCRJsonFromPDF(doc, options);
165
166 // C) Post-processing step (whatever it might be), but we just print json here
167 System.out.println("Have OCR result JSON, re-applying to PDF");
168
169 // D) Apply potentially modified OCR JSON to the PDF
170 OCRModule.applyOCRJsonToPDF(doc, json);
171
172 // E) Check the result
173 doc.save(output_path + "zero_value_test_no_text.pdf", SDFDoc.SaveMode.LINEARIZED, null);
174 System.out.println("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf");
175 } catch (Exception e) {
176 e.printStackTrace();
177 }
178
179 //--------------------------------------------------------------------------------
180 // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
181 try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
182 {
183 OCROptions options = new OCROptions();
184 if(use_iris) options.setOCREngine("iris");
185
186 // B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
187 // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
188 String xml = OCRModule.getOCRXmlFromImage(doc, input_path + "physics.tif", options);
189
190 // C) Post-processing step (whatever it might be), but we just print XML here
191 System.out.println("Have OCR result XML, applying to PDF");
192
193 // D) Apply potentially modified OCR XML to the PDF
194 OCRModule.applyOCRXmlToPDF(doc, xml);
195
196 // E) Check the result
197 doc.save(output_path + "physics.pdf", SDFDoc.SaveMode.LINEARIZED, null);
198 System.out.println("Example 6: extracting and applying OCR XML from physics.tif");
199 }
200 catch (Exception e) {
201 e.printStackTrace();
202 }
203
204 PDFNet.terminate();
205 } catch (Exception e) {
206 e.printStackTrace();
207 }
208 }
209}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales