OCR to search PDFs and Extract Text - Java Sample Code

This sample code shows how to use the Apryse Server OCR module on scanned documents in multiple languages. It is provided in Python, C++, C# (.NET), Java, Node.js (JavaScript), PHP, Ruby and VB. The OCR module can make PDFs searchable and extract scanned text for further indexing.

To run this sample, you will need to:

  1. Get started with Server SDK in your language/framework
  2. Download an OCR Module

Learn more about our Server SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
5
6import com.pdftron.sdf.Obj;
7import com.pdftron.sdf.ObjSet;
8import com.pdftron.sdf.SDFDoc;
9import com.pdftron.pdf.*;
10
11import com.pdftron.common.PDFNetException;
12
//---------------------------------------------------------------------------------------
// The following sample illustrates how to use the OCR module
//---------------------------------------------------------------------------------------
16public class OCRTest {
17 public static void main(String[] args) {
18 try {
19 // The first step in every application using PDFNet is to initialize the
20 // library and set the path to common PDF resources. The library is usually
21 // initialized only once, but calling Initialize() multiple times is also fine.
22 PDFNet.initialize(PDFTronLicense.Key());
23 PDFNet.addResourceSearchPath("../../../Lib/");
24
25 boolean use_iris = OCRModule.isIRISModuleAvailable();
26 if( !OCRModule.isModuleAvailable() )
27 {
28 System.out.println("");
29 System.out.println("Unable to run OCRTest: Apryse SDK OCR module not available.");
30 System.out.println("---------------------------------------------------------------");
31 System.out.println("The OCR module is an optional add-on, available for download");
32 System.out.println("at https://docs.apryse.com/core/guides/info/modules#ocr-module . If you have already downloaded this");
33 System.out.println("module, ensure that the SDK is able to find the required files");
34 System.out.println("using the PDFNet.addResourceSearchPath() function.");
35 System.out.println("");
36 return;
37 }
38
39 // Relative path to the folder containing test files.
40 String input_path = "../../TestFiles/OCR/";
41 String output_path = "../../TestFiles/Output/";
42
43 //--------------------------------------------------------------------------------
44 // Example 1) Process image without specifying options, default language - English - is used
45 try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
46 {
47 OCROptions options = new OCROptions();
48 if(use_iris) options.setOCREngine("iris");
49
50 // B) Run OCR on the .png with options
51 OCRModule.imageToPDF(doc, input_path + "psychomachia_excerpt.png", options);
52
53 // C) check the result
54 doc.save(output_path + "psychomachia_excerpt.pdf", SDFDoc.SaveMode.LINEARIZED, null);
55 System.out.println("Example 1: psychomachia_excerpt.png");
56
57 } catch (Exception e) {
58 e.printStackTrace();
59 }
60
61 //--------------------------------------------------------------------------------
62 // Example 2) Process document using multiple languages
63 try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
64 {
65 // B) Setup options with multiple target languages, English will always be considered as secondary language
66 OCROptions options = new OCROptions();
67 if(use_iris) options.setOCREngine("iris");
68 options.addLang("deu");
69 options.addLang("fra");
70 options.addLang("eng");
71
72 // C) Run OCR on the .jpg with options
73 OCRModule.imageToPDF(doc, input_path + "multi_lang.jpg", options);
74
75 // D) check the result
76 doc.save(output_path + "multi_lang.pdf", SDFDoc.SaveMode.LINEARIZED, null);
77 System.out.println("Example 2: multi_lang.jpg");
78 } catch (Exception e) {
79 e.printStackTrace();
80 }
81
82 //--------------------------------------------------------------------------------
83 // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
84 try (PDFDoc doc = new PDFDoc(input_path + "german_kids_song.pdf")) // A) Open the .pdf document
85 {
86 // B) Setup options with a single language and an ignore zone
87 OCROptions options = new OCROptions();
88 if(use_iris) options.setOCREngine("iris");
89 options.addLang("deu");
90
91 RectCollection zones = new RectCollection();
92 zones.addRect(424, 163, 493, 730);
93
94 options.addIgnoreZonesForPage(zones, 1);
95
96 // C) Run OCR on the .pdf with options
97 OCRModule.processPDF(doc, options);
98
99 // D) check the result
100 doc.save(output_path + "german_kids_song.pdf", SDFDoc.SaveMode.LINEARIZED, null);
101 System.out.println("Example 3: german_kids_song.pdf");
102 } catch (Exception e) {
103 e.printStackTrace();
104 }
105
106 //--------------------------------------------------------------------------------
107 // Example 4) Process multipage tiff with text/ignore zones specified for each page, optionally provide English as the target language
108
109 try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
110 {
111 // B) Setup options with a single language plus text/ignore zones
112 OCROptions options = new OCROptions();
113 if(use_iris) options.setOCREngine("iris");
114 options.addLang("eng");
115
116 RectCollection zones = new RectCollection();
117 zones.addRect(1492, 56, 2236, 432);
118
119 // ignore signature box in the first 2 pages
120 options.addIgnoreZonesForPage(zones, 1);
121 options.addIgnoreZonesForPage(zones, 2);
122
123 // can use a combination of ignore and text boxes to focus on the page area of interest,
124 // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
125 zones.clear();
126 zones.addRect(992, 1276, 1368, 1372);
127 options.addIgnoreZonesForPage(zones, 3);
128
129 // we only have text zones selected in page 3
130
131 zones.clear();
132 // select horizontal BUFFER ZONE sign
133 zones.addRect(900, 2384, 1236, 2480);
134 // select right vertical BUFFER ZONE sign
135 zones.addRect(1960, 1976, 2016, 2296);
136 // select Lot No.
137 zones.addRect(696, 1028, 1196, 1128);
138
139 // select part of the plan inside the BUFFER ZONE
140 zones.addRect(428, 1484, 1784, 2344);
141 zones.addRect(948, 1288, 1672, 1476);
142
143 options.addTextZonesForPage(zones, 3);
144
145 // C) Run OCR on the .tif with options
146 OCRModule.imageToPDF(doc, input_path + "bc_environment_protection.tif", options);
147
148 // D) check the result
149 doc.save(output_path + "bc_environment_protection.pdf", SDFDoc.SaveMode.LINEARIZED, null);
150 System.out.println("Example 4: bc_environment_protection.tif");
151 } catch (Exception e) {
152 e.printStackTrace();
153 }
154
155 //--------------------------------------------------------------------------------
156 // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the dictionary or filtering special
157 // out special characters), and finally applying modified OCR JSON to the source PDF document
158 try (PDFDoc doc = new PDFDoc(input_path + "zero_value_test_no_text.pdf")) // A) Open the .pdf document
159 {
160 OCROptions options = new OCROptions();
161 if(use_iris) options.setOCREngine("iris");
162
163 // B) Run OCR on the .pdf with default English language
164 String json = OCRModule.getOCRJsonFromPDF(doc, options);
165
166 // C) Post-processing step (whatever it might be), but we just print json here
167 System.out.println("Have OCR result JSON, re-applying to PDF");
168
169 // D) Apply potentially modified OCR JSON to the PDF
170 OCRModule.applyOCRJsonToPDF(doc, json);
171
172 // E) Check the result
173 doc.save(output_path + "zero_value_test_no_text.pdf", SDFDoc.SaveMode.LINEARIZED, null);
174 System.out.println("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf");
175 } catch (Exception e) {
176 e.printStackTrace();
177 }
178
179 //--------------------------------------------------------------------------------
180 // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
181 try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
182 {
183 OCROptions options = new OCROptions();
184 if(use_iris) options.setOCREngine("iris");
185
186 // B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
187 // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
188 String xml = OCRModule.getOCRXmlFromImage(doc, input_path + "physics.tif", options);
189
190 // C) Post-processing step (whatever it might be), but we just print XML here
191 System.out.println("Have OCR result XML, applying to PDF");
192
193 // D) Apply potentially modified OCR XML to the PDF
194 OCRModule.applyOCRXmlToPDF(doc, xml);
195
196 // E) Check the result
197 doc.save(output_path + "physics.pdf", SDFDoc.SaveMode.LINEARIZED, null);
198 System.out.println("Example 6: extracting and applying OCR XML from physics.tif");
199 }
200 catch (Exception e) {
201 e.printStackTrace();
202 }
203
204 PDFNet.terminate();
205 } catch (Exception e) {
206 e.printStackTrace();
207 }
208 }
209}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales