OCR to search PDFs and Extract Text - Ruby Sample Code

Requirements

This sample code shows how to use the Apryse Server OCR module on scanned documents in multiple languages. It is provided in Python, C++, C# (.NET), Go, Java, Node.js (JavaScript), PHP, Ruby and VB. The OCR module can make PDFs searchable and extract scanned text for further indexing.

Looking for OCR + WebViewer? Check out our OCR - Showcase Sample Code

Learn more about our Server SDK and OCR capabilities.

Implementation steps

To run this sample, complete the following steps:

Get started with Server SDK in your language/framework
Download OCR Module
Add the sample code provided below

To use this feature in production, your license key will need the OCR Package. Trial keys already include this package.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.     
4//---------------------------------------------------------------------------------------
5
6using System;
7using pdftron;
8using pdftron.Common;
9using pdftron.SDF;
10using pdftron.PDF;
11
12namespace OCRTestCS
13{
14    
15    /// <summary>
16    //---------------------------------------------------------------------------------------
17    // The following sample illustrates how to use OCR module
18    //---------------------------------------------------------------------------------------
19    /// </summary>
20    class Class1
21    {
22        private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
23        static Class1() {}
24        
25        /// <summary>
26        /// The main entry point for the application.
27        /// </summary>
28        static void Main(string[] args)
29        {
30            // The first step in every application using PDFNet is to initialize the 
31            // library and set the path to common PDF resources. The library is usually 
32            // initialized only once, but calling Initialize() multiple times is also fine.
33            PDFNet.Initialize(PDFTronLicense.Key);
34
35            // Can optionally set path to the OCR module
36            PDFNet.AddResourceSearchPath("../../../../../Lib/");
37
38            // if the IRIS OCR module is available, will use that instead of the default
39            bool use_iris = OCRModule.IsIRISModuleAvailable();
40            if( !OCRModule.IsModuleAvailable() )
41            {
42                Console.WriteLine("");
43                Console.WriteLine("Unable to run OCRTest: Apryse SDK OCR module not available.");
44                Console.WriteLine("---------------------------------------------------------------");
45                Console.WriteLine("The OCR module is an optional add-on, available for download");
46                Console.WriteLine("at https://docs.apryse.com/core/guides/info/modules#ocr-module . If you have already downloaded this");
47                Console.WriteLine("module, ensure that the SDK is able to find the required files");
48                Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
49                Console.WriteLine("");
50                return;
51            }
52
53            // Relative path to the folder containing test files.
54            string input_path =  "../../../../TestFiles/OCR/";
55            string output_path = "../../../../TestFiles/Output/";
56
57            //--------------------------------------------------------------------------------
58            // Example 1) Process image
59            try
60            {
61
62                // A) Setup empty destination doc
63                using (PDFDoc doc = new PDFDoc())
64                {
65                    // B) Set English as the language of choice
66                    OCROptions opts = new OCROptions();
67                    if(use_iris) opts.SetOCREngine("iris");
68                    opts.AddLang("eng");
69
70                    // C) Run OCR on the .png with options            
71                    OCRModule.ImageToPDF(doc, input_path + "psychomachia_excerpt.png", opts);
72
73                    // D) check the result
74                    doc.Save(output_path + "psychomachia_excerpt.pdf", SDFDoc.SaveOptions.e_remove_unused);
75
76                    Console.WriteLine("Example 1: psychomachia_excerpt.png");
77                }
78
79            }
80            catch (PDFNetException e)
81            {
82                Console.WriteLine(e.Message);
83            }
84
85            //--------------------------------------------------------------------------------
86            // Example 2) Process document using multiple languages
87            try
88            {
89
90                // A) Setup empty destination doc
91                using (PDFDoc doc = new PDFDoc())
92                {
93
94                    // B) Setup options with multiple target languages, English will always be considered as secondary language
95                    OCROptions opts = new OCROptions();
96                    if(use_iris) opts.SetOCREngine("iris");
97                    opts.AddLang("deu");
98                    opts.AddLang("fra");
99                    opts.AddLang("eng");
100
101                    // C) Run OCR on the .jpg with options        
102                    OCRModule.ImageToPDF(doc, input_path + "multi_lang.jpg", opts);
103
104                    // D) check the result
105                    doc.Save(output_path + "multi_lang.pdf", SDFDoc.SaveOptions.e_remove_unused);
106
107                    Console.WriteLine("Example 2: multi_lang.jpg");
108                }
109
110            }
111            catch (PDFNetException e)
112            {
113                Console.WriteLine(e.Message);
114            }
115
116            //--------------------------------------------------------------------------------
117            // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image 
118            try
119            {
120
121                // A) Open the .pdf document
122                using (PDFDoc doc = new PDFDoc(input_path + "german_kids_song.pdf"))
123                {
124
125                    // B) Setup options with a single language and an ignore zone
126                    OCROptions opts = new OCROptions();
127                    if(use_iris) opts.SetOCREngine("iris");
128                    opts.AddLang("deu");
129
130                    RectCollection ignoreZones = new RectCollection();
131                    ignoreZones.AddRect(424, 163, 493, 730);
132                    opts.AddIgnoreZonesForPage(ignoreZones, 1);
133
134                    // C) Run OCR on the .pdf with options
135                    OCRModule.ProcessPDF(doc, opts);
136
137                    // D) check the result
138                    doc.Save(output_path + "german_kids_song.pdf", SDFDoc.SaveOptions.e_remove_unused);
139
140                    Console.WriteLine("Example 3: german_kids_song.pdf");
141                }
142
143            }
144            catch (PDFNetException e)
145            {
146                Console.WriteLine(e.Message);
147            }
148
149            //--------------------------------------------------------------------------------
150            // Example 4) Process multipage tiff with text/ignore zones specified for each page
151            try
152            {
153
154                // A) Setup empty destination doc
155                using (PDFDoc doc = new PDFDoc())
156                {
157
158                    // B) Setup options with a single language plus text/ignore zones
159                    OCROptions opts = new OCROptions();
160                    if(use_iris) opts.SetOCREngine("iris");
161                    opts.AddLang("eng");
162
163                    RectCollection zones = new RectCollection();
164
165
166                    // ignore signature box in the first 2 pages
167                    zones.AddRect(1492, 56, 2236, 432);
168                    opts.AddIgnoreZonesForPage(zones, 1);
169                    zones.Clear();
170
171                    zones.AddRect(1492, 56, 2236, 432);
172                    opts.AddIgnoreZonesForPage(zones, 2);
173                    zones.Clear();
174
175                    // can use a combination of ignore and text boxes to focus on the page area of interest,
176                    // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
177                    zones.AddRect(992, 1276, 1368, 1372);
178                    opts.AddIgnoreZonesForPage(zones, 3);
179                    zones.Clear();
180
181                    // select horizontal BUFFER ZONE sign
182                    zones.AddRect(900, 2384, 1236, 2480);
183                    // select right vertical BUFFER ZONE sign
184                    zones.AddRect(1960, 1976, 2016, 2296);
185                    // select Lot No.
186                    zones.AddRect(696, 1028, 1196, 1128);
187
188                    // select part of the plan inside the BUFFER ZONE
189                    zones.AddRect(428, 1484, 1784, 2344);
190                    zones.AddRect(948, 1288, 1672, 1476);
191                    opts.AddTextZonesForPage(zones, 3);
192
193                    // C) Run OCR on the .pdf with options
194                    OCRModule.ImageToPDF(doc, input_path + "bc_environment_protection.tif", opts);
195
196                    // D) check the result
197                    doc.Save(output_path + "bc_environment_protection.pdf", SDFDoc.SaveOptions.e_remove_unused);
198
199                    Console.WriteLine("Example 4: bc_environment_protection.tif");
200                }
201
202            }
203            catch (PDFNetException e)
204            {
205                Console.WriteLine(e.Message);
206            }
207
208            //--------------------------------------------------------------------------------
209            // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the dictionary or filtering special
210            // out special characters), and finally applying modified OCR JSON to the source PDF document 
211            try
212            {
213
214                // A) Open the .pdf document
215                using (PDFDoc doc = new PDFDoc(input_path + "zero_value_test_no_text.pdf"))
216                {
217
218                    // B) set English language
219                    OCROptions opts = new OCROptions();
220                    if(use_iris) opts.SetOCREngine("iris");
221                    opts.AddLang("eng");
222
223
224                    // C) Run OCR on the .pdf 
225                    string json = OCRModule.GetOCRJsonFromPDF(doc, opts);
226
227                    // D) Post-processing step (whatever it might be), but we just print JSON here
228                    Console.WriteLine("Have OCR result JSON, re-applying to PDF");
229
230                    // E) Apply potentially modified OCR JSON to the PDF
231                    OCRModule.ApplyOCRJsonToPDF(doc, json);
232
233                    // F) check the result
234                    doc.Save(output_path + "zero_value_test_no_text.pdf", SDFDoc.SaveOptions.e_remove_unused);
235
236                    Console.WriteLine("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf");
237                }
238
239            }
240            catch (PDFNetException e)
241            {
242                Console.WriteLine(e.Message);
243            }
244
245            //--------------------------------------------------------------------------------
246            // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
247            try
248            {
249
250                // A) Setup empty destination doc
251                using (PDFDoc doc = new PDFDoc())
252                {
253
254                    // B) set English language
255                    OCROptions opts = new OCROptions();
256                    if(use_iris) opts.SetOCREngine("iris");
257                    opts.AddLang("eng");
258
259                    // C) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
260                    // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
261
262                    string xml = OCRModule.GetOCRXmlFromImage(doc, input_path + "physics.tif", opts);
263
264                    // D) Post-processing step (whatever it might be), but we just print XML here
265                    Console.WriteLine("Have OCR result XML, re-applying to PDF");
266
267                    // E) Apply potentially modified OCR XML to the PDF
268                    OCRModule.ApplyOCRXmlToPDF(doc, xml);
269
270                    // F) check the result
271                    doc.Save(output_path + "physics.pdf", SDFDoc.SaveOptions.e_remove_unused);
272
273                    Console.WriteLine("Example 6: extracting and applying OCR XML from physics.tif");
274                }
275
276            }
277            catch (PDFNetException e)
278            {
279                Console.WriteLine(e.Message);
280            }
281
282            PDFNet.Terminate();
283        }
284
285    }
286}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package main
7
8import (
9	"fmt"
10	. "github.com/pdftron/pdftron-go/v2"
11)
12
13// Relative path to the folder containing test files.
14var inputPath =  "../../TestFiles/OCR/"
15var outputPath = "../../TestFiles/Output/"
16
17// ---------------------------------------------------------------------------------------
18// The following sample illustrates how to use OCR module
19// --------------------------------------------------------------------------------------
20
21func main(){
22
23    // The first step in every application using PDFNet is to initialize the 
24	// library and set the path to common PDF resources. The library is usually 
25	// initialized only once, but calling Initialize() multiple times is also fine.
26	var licenseKey = "YOUR_LICENSE_KEY"
27	PDFNetInitialize(licenseKey)
28
29	var iris_installed = true // Set to true if the IRIS OCR module is installed and you wish to use it
30
31	// The location of the OCR Module
32	if iris_installed {
33		PDFNetAddResourceSearchPath("../../IRISOCRModuleWindows/Lib")
34	} else {
35		PDFNetAddResourceSearchPath("../../OCRModuleWindows/Lib")
36	}
37
38	var use_iris = OCRModuleIsIRISModuleAvailable()
39
40	if ! OCRModuleIsModuleAvailable(){
41		fmt.Println("Unable to run OCRTest: PDFTron SDK OCR module not available.\n" +
42		"---------------------------------------------------------------\n" +
43		"The OCR module is an optional add-on, available for download\n" +
44		"at https://docs.apryse.com/core/guides/info/modules#ocr-module . If you have already downloaded this\n" +
45		"module, ensure that the SDK is able to find the required files\n" +
46		"using the PDFNetAddResourceSearchPath() function.")
47		return
48	}
49
50	// Example 1) Process image without specifying options, default language - English - is used
51	// --------------------------------------------------------------------------------
52
53	// A) Setup empty destination doc
54	doc := NewPDFDoc()
55
56	// B) Run OCR on the .png with options
57	ocrOpts := NewOCROptions()
58	if use_iris {
59		ocrOpts.SetOCREngine("iris")
60	}
61		
62	OCRModuleImageToPDF(doc, inputPath + "psychomachia_excerpt.png", ocrOpts)
63
64	// C) Check the result
65	doc.Save(outputPath + "psychomachia_excerpt.pdf", uint(0))
66	fmt.Println("Example 1: psychomachia_excerpt.png")
67
68	// Example 2) Process document using multiple languages
69	// --------------------------------------------------------------------------------
70
71	// A) Setup empty destination doc
72	doc = NewPDFDoc()
73
74	// B) Setup options with multiple target languages, English will always be considered as secondary language
75	opts := NewOCROptions()
76	if use_iris {
77		opts.SetOCREngine("iris")
78	}
79	opts.AddLang("deu")
80	opts.AddLang("fra")
81	opts.AddLang("eng")
82
83	// C) Run OCR on the .jpg with options
84	OCRModuleImageToPDF(doc, inputPath + "multi_lang.jpg", opts)
85
86	// D) Check the result
87	doc.Save(outputPath + "multi_lang.pdf", uint(0))
88	fmt.Println("Example 2: multi_lang.jpg")
89
90	// Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
91	// --------------------------------------------------------------------------------
92
93	// A) Open the .pdf document
94	doc = NewPDFDoc(inputPath + "german_kids_song.pdf")
95
96	// B) Setup options with a single language and an ignore zone
97	opts = NewOCROptions()
98	if use_iris {
99		opts.SetOCREngine("iris")
100	}
101	opts.AddLang("deu")
102
103	ignoreZones := NewRectCollection()
104	ignoreZones.AddRect(NewRect(424.0, 163.0, 493.0, 730.0))
105	opts.AddIgnoreZonesForPage(ignoreZones, 1)
106
107	// C) Run OCR on the .pdf with options
108	OCRModuleProcessPDF(doc, opts)
109
110	// D) check the result
111	doc.Save(outputPath + "german_kids_song.pdf", uint(0))
112	fmt.Println("Example 3: german_kids_song.pdf")
113
114	// Example 4) Process multi-page tiff with text/ignore zones specified for each page,
115	// --------------------------------------------------------------------------------
116
117	// A) Setup empty destination doc
118	doc = NewPDFDoc()
119
120	// B) Setup options with a single language plus text/ignore zones
121	opts = NewOCROptions()
122	if use_iris {
123		opts.SetOCREngine("iris")
124	}
125	opts.AddLang("eng")
126
127	ignoreZones = NewRectCollection()
128
129	// ignore signature box in the first 2 pages
130	ignoreZones.AddRect(NewRect(1492.0, 56.0, 2236.0, 432.0))
131	opts.AddIgnoreZonesForPage(ignoreZones, 1)
132	opts.AddIgnoreZonesForPage(ignoreZones, 2)
133
134	// can use a combination of ignore and text boxes to focus on the page area of interest,
135	// as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
136	ignoreZones.Clear()
137	ignoreZones.AddRect(NewRect(992.0, 1276.0, 1368.0, 1372.0))
138	opts.AddIgnoreZonesForPage(ignoreZones, 3)
139
140	textZones := NewRectCollection()
141	// we only have text zones selected in page 3
142	// select horizontal BUFFER ZONE sign
143	textZones.AddRect(NewRect(900.0, 2384.0, 1236.0, 2480.0))
144	// select right vertical BUFFER ZONE sign
145	textZones.AddRect(NewRect(1960.0, 1976.0, 2016.0, 2296.0))
146	// select Lot No.
147	textZones.AddRect(NewRect(696.0, 1028.0, 1196.0, 1128.0))
148
149	// select part of the plan inside the BUFFER ZONE
150	textZones.AddRect(NewRect(428.0, 1484.0, 1784.0, 2344.0))
151	textZones.AddRect(NewRect(948.0, 1288.0, 1672.0, 1476.0))
152	opts.AddTextZonesForPage(textZones, 3)
153
154	// C) Run OCR on the .pdf with options
155	OCRModuleImageToPDF(doc, inputPath + "bc_environment_protection.tif", opts)
156
157	// D) check the result
158	doc.Save(outputPath + "bc_environment_protection.pdf", uint(0))
159	fmt.Println("Example 4: bc_environment_protection.tif")
160
161	// Example 5) Alternative workflow for extracting OCR result JSON, postprocessing
162	// (e.g., removing words not in the dictionary or filtering out special characters),
163	// and finally applying modified OCR JSON to the source PDF document
164	// --------------------------------------------------------------------------------
165
166	// A) Open the .pdf document
167	doc = NewPDFDoc(inputPath + "zero_value_test_no_text.pdf")
168
169	// B) Run OCR on the .pdf with English language
170
171	opts = NewOCROptions()
172	if use_iris {
173		opts.SetOCREngine("iris")
174	}
175	opts.AddLang("eng")
176
177	// C) Run OCR on the .pdf
178	json := OCRModuleGetOCRJsonFromPDF(doc, opts)
179
180	// D) Post-processing step (whatever it might be)
181	fmt.Println("Have OCR result JSON, re-applying to PDF")
182	
183	// E) Apply potentially modified OCR JSON to the PDF
184	OCRModuleApplyOCRJsonToPDF(doc, json)
185
186	// F) Check the result
187	doc.Save(outputPath + "zero_value_test_no_text.pdf", uint(0))
188	fmt.Println("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf")
189
190	// Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format,
191	// similar to the one used by TextExtractor
192	// --------------------------------------------------------------------------------
193
194	// A) Setup empty destination doc
195	doc = NewPDFDoc()
196
197	// B) Run OCR on the .tif with English language, extracting OCR results in XML format. Note that
198	// in the process we convert the source image into PDF.
199	// We reuse this PDF document later to add hidden text layer to it.
200	opts = NewOCROptions()
201	if use_iris {
202		opts.SetOCREngine("iris")
203	}
204	opts.AddLang("eng")
205	xml := OCRModuleGetOCRXmlFromImage(doc, inputPath + "physics.tif", opts)
206
207	// C) Post-processing step (whatever it might be)
208	fmt.Println("Have OCR result XML, re-applying to PDF")
209
210	// D) Apply potentially modified OCR XML to the PDF
211	OCRModuleApplyOCRXmlToPDF(doc, xml)
212
213	// E) Check the result
214	doc.Save(outputPath + "physics.pdf", uint(0))
215	fmt.Println("Example 6: extracting and applying OCR XML from physics.tif")
216}
217

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5#include <PDF/PDFNet.h>
6#include <PDF/PDFDoc.h>
7#include <PDF/OCRModule.h>
8#include <PDF/OCROptions.h>
9#include <SDF/Obj.h>
10#include <iostream>
11#include "../../LicenseKey/CPP/LicenseKey.h"
12
13using namespace std;
14using namespace pdftron;
15using namespace PDF;
16using namespace SDF;
17
18//---------------------------------------------------------------------------------------
19// The following sample illustrates how to use OCR module
20//---------------------------------------------------------------------------------------
21int main(int argc, char *argv[])
22{
23	try 
24	{
25		// The first step in every application using PDFNet is to initialize the 
26		// library and set the path to common PDF resources. The library is usually 
27		// initialized only once, but calling Initialize() multiple times is also fine.
28		PDFNet::Initialize(LicenseKey);
29		// The location of the OCR Module
30		PDFNet::AddResourceSearchPath("../../../Lib/");
31
32		// if the IRIS OCR module is available, will use that instead of the default
33		const bool use_iris = OCRModule::IsIRISModuleAvailable();
34		if(!OCRModule::IsModuleAvailable())
35		{
36			cout << endl;
37			cout << "Unable to run OCRTest: Apryse SDK OCR module not available." << endl;
38			cout << "---------------------------------------------------------------" << endl;
39			cout << "The OCR module is an optional add-on, available for download" << endl;
40			cout << "at https://docs.apryse.com/core/guides/info/modules#ocr-module . If you have already downloaded this" << endl;
41			cout << "module, ensure that the SDK is able to find the required files" << endl;
42			cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
43			return 0;
44		}
45
46		// Relative path to the folder containing test files.
47		string input_path =  "../../TestFiles/OCR/";
48		string output_path = "../../TestFiles/Output/";
49
50
51		//--------------------------------------------------------------------------------
52		// Example 1) Process image without specifying options, default language - English - is used
53		try 
54		{
55
56			// A) Setup empty destination doc
57
58			PDFDoc doc;
59
60			// B) Run OCR on the .png without options
61
62			OCROptions opts;
63			if(use_iris) opts.SetOCREngine("iris");
64			OCRModule::ImageToPDF(doc, input_path + "psychomachia_excerpt.png", &opts);
65
66			// C) check the result
67
68			doc.Save(output_path + "psychomachia_excerpt.pdf", 0, 0);
69			
70			cout << "Example 1: psychomachia_excerpt.png" << endl;
71
72		}
73		catch(Common::Exception& e)	
74		{
75			cout << e << endl;
76		}
77		catch(...) 
78		{
79			cout << "Unknown Exception" << endl;
80		}
81
82		//--------------------------------------------------------------------------------
83		// Example 2) Process document using multiple languages
84		try
85		{
86			// A) Setup empty destination doc
87
88			PDFDoc doc;
89
90			// B) Setup options with multiple target languages, English will always be considered as secondary language
91
92			OCROptions opts;
93			if(use_iris) opts.SetOCREngine("iris");
94			opts.AddLang("deu");
95			opts.AddLang("fra");
96			opts.AddLang("eng");
97
98			// C) Run OCR on the .jpg with options
99
100			OCRModule::ImageToPDF(doc, input_path + "multi_lang.jpg", &opts);
101
102			// D) check the result
103
104			doc.Save(output_path + "multi_lang.pdf", 0, 0);
105
106			cout << "Example 2: multi_lang.jpg" << endl;
107
108		}
109		catch (Common::Exception& e)
110		{
111			cout << e << endl;
112		}
113		catch (...)
114		{
115			cout << "Unknown Exception" << endl;
116		}
117
118		//--------------------------------------------------------------------------------
119		// Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image 
120		try
121		{
122			// A) Open the .pdf document
123
124			PDFDoc doc((input_path + "german_kids_song.pdf").c_str());
125
126			// B) Setup options with a single language and an ignore zone
127
128			OCROptions opts;
129			if(use_iris) opts.SetOCREngine("iris");
130			opts.AddLang("deu");
131
132			RectCollection ignore_zones;
133			ignore_zones.AddRect(424, 163, 493, 730);
134			opts.AddIgnoreZonesForPage(ignore_zones, 1);
135
136			// C) Run OCR on the .pdf with options
137
138			OCRModule::ProcessPDF(doc, &opts);
139
140			// D) check the result
141
142			PDFDoc doc_out(doc);
143			doc_out.Save(output_path + "german_kids_song.pdf", 0, 0);
144
145			cout << "Example 3: german_kids_song.pdf" << endl;
146		}
147		catch (Common::Exception& e)
148		{
149			cout << e << endl;
150		}
151		catch (...)
152		{
153			cout << "Unknown Exception" << endl;
154		}
155
156		//--------------------------------------------------------------------------------
157		// Example 4) Process multipage tiff with text/ignore zones specified for each page, optionally provide English as the target language
158		try
159		{
160			// A) Setup empty destination doc
161
162			PDFDoc doc;
163
164			// B) Setup options with a single language plus text/ignore zones
165
166			OCROptions opts;
167			if(use_iris) opts.SetOCREngine("iris");
168			opts.AddLang("eng");
169
170			RectCollection ignore_zones;
171			// ignore signature box in the first 2 pages
172			ignore_zones.AddRect(1492, 56, 2236, 432);
173			opts.AddIgnoreZonesForPage(ignore_zones, 1);
174			opts.AddIgnoreZonesForPage(ignore_zones, 2);
175
176			// can use a combination of ignore and text boxes to focus on the page area of interest,
177			// as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
178			ignore_zones.Clear();
179			ignore_zones.AddRect(992, 1276, 1368, 1372);
180			opts.AddIgnoreZonesForPage(ignore_zones, 3);
181
182			RectCollection text_zones;
183			// we only have text zones selected in page 3
184
185			// select horizontal BUFFER ZONE sign
186			text_zones.AddRect(900, 2384, 1236, 2480);
187			// select right vertical BUFFER ZONE sign
188			text_zones.AddRect(1960, 1976, 2016, 2296);
189			// select Lot No.
190			text_zones.AddRect(696, 1028, 1196, 1128);
191
192			// select part of the plan inside the BUFFER ZONE
193			text_zones.AddRect(428, 1484, 1784, 2344);
194			text_zones.AddRect(948, 1288, 1672, 1476);
195			opts.AddTextZonesForPage(text_zones, 3);
196
197			// C) Run OCR on the .tif with options
198
199			OCRModule::ImageToPDF(doc, input_path + "bc_environment_protection.tif", &opts);
200
201			// D) check the result
202
203			doc.Save(output_path + "bc_environment_protection.pdf", 0, 0);
204
205			cout << "Example 4: bc_environment_protection.tif" << endl;
206
207		}
208		catch (Common::Exception& e)
209		{
210			cout << e << endl;
211		}
212		catch (...)
213		{
214			cout << "Unknown Exception" << endl;
215		}
216
217		//--------------------------------------------------------------------------------
218		// Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the dictionary or filtering special
219		// out special characters), and finally applying modified OCR JSON to the source PDF document 
220		try
221		{
222
223			// A) Open the .pdf document
224
225			PDFDoc doc((input_path + "zero_value_test_no_text.pdf").c_str());
226
227			// B) Run OCR on the .pdf with default English language
228			OCROptions opts;
229			if(use_iris) opts.SetOCREngine("iris");
230
231			UString json = OCRModule::GetOCRJsonFromPDF(doc, &opts);
232
233			// C) Post-processing step (whatever it might be)
234
235			cout << "Have OCR result JSON, re-applying to PDF " << endl;
236
237			// D) Apply potentially modified OCR JSON to the PDF
238
239			OCRModule::ApplyOCRJsonToPDF(doc, json);
240
241			// E) Check the result
242
243			PDFDoc doc_out(doc);
244			doc_out.Save(output_path + "zero_value_test_no_text.pdf", 0, 0);
245
246			cout << "Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf" << endl;
247
248		}
249		catch (Common::Exception& e)
250		{
251			cout << e << endl;
252		}
253		catch (...)
254		{
255			cout << "Unknown Exception" << endl;
256		}
257
258		//--------------------------------------------------------------------------------
259		// Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
260		try
261		{
262
263			// A) Setup empty destination doc
264
265			PDFDoc doc;
266
267			// B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
268			// in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
269			
270			OCROptions opts;
271			if(use_iris) opts.SetOCREngine("iris");
272			UString xml = OCRModule::GetOCRXmlFromImage(doc, input_path + "physics.tif", NULL);
273
274			// C) Post-processing step (whatever it might be)
275
276			cout << "Have OCR result XML, re-applying to PDF" << endl;
277
278			// D) Apply potentially modified OCR XML to the PDF
279
280			OCRModule::ApplyOCRXmlToPDF(doc, xml);
281
282			// E) Check the result
283
284			PDFDoc doc_out(doc);
285			doc_out.Save(output_path + "physics.pdf", 0, 0);
286
287			cout << "Example 6: extracting and applying OCR XML from physics.tif" << endl;
288
289		}
290		catch (Common::Exception& e)
291		{
292			cout << e << endl;
293		}
294		catch (...)
295		{
296			cout << "Unknown Exception" << endl;
297		}
298
299		cout << "Done." << endl;
300
301		PDFNet::Terminate();
302	}
303	catch(Common::Exception& e)	
304	{
305		cout << e << endl;
306	}
307	catch (...) {
308		cout << "Unknown Exception" << endl;
309	}
310
311	return 0;	
312}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import com.pdftron.sdf.Obj;
7import com.pdftron.sdf.ObjSet;
8import com.pdftron.sdf.SDFDoc;
9import com.pdftron.pdf.*;
10
11import com.pdftron.common.PDFNetException;
12
13//---------------------------------------------------------------------------------------
14// The following sample illustrates how to use OCR module
15//---------------------------------------------------------------------------------------
16public class OCRTest {
17	public static void main(String[] args) {
18		try {
19			// The first step in every application using PDFNet is to initialize the
20			// library and set the path to common PDF resources. The library is usually
21			// initialized only once, but calling Initialize() multiple times is also fine.
22			PDFNet.initialize(PDFTronLicense.Key());
23			PDFNet.addResourceSearchPath("../../../Lib/");
24
25			boolean use_iris = OCRModule.isIRISModuleAvailable();
26			if( !OCRModule.isModuleAvailable() )
27			{
28				System.out.println("");
29				System.out.println("Unable to run OCRTest: Apryse SDK OCR module not available.");
30				System.out.println("---------------------------------------------------------------");
31				System.out.println("The OCR module is an optional add-on, available for download");
32				System.out.println("at https://docs.apryse.com/core/guides/info/modules#ocr-module . If you have already downloaded this");
33				System.out.println("module, ensure that the SDK is able to find the required files");
34				System.out.println("using the PDFNet.addResourceSearchPath() function.");
35				System.out.println("");
36				return;
37			}
38
39			// Relative path to the folder containing test files.
40			String input_path = "../../TestFiles/OCR/";
41			String output_path = "../../TestFiles/Output/";
42
43			//--------------------------------------------------------------------------------
44			// Example 1) Process image without specifying options, default language - English - is used
45			try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
46			{
47				OCROptions options = new OCROptions();
48				if(use_iris) options.setOCREngine("iris");
49
50				// B) Run OCR on the .png with options
51				OCRModule.imageToPDF(doc, input_path + "psychomachia_excerpt.png", options);
52
53				// C) check the result
54				doc.save(output_path + "psychomachia_excerpt.pdf", SDFDoc.SaveMode.LINEARIZED, null);
55				System.out.println("Example 1: psychomachia_excerpt.png");
56				
57			} catch (Exception e) {
58				e.printStackTrace();
59			}
60
61			//--------------------------------------------------------------------------------
62			// Example 2) Process document using multiple languages
63			try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
64			{
65				// B) Setup options with multiple target languages, English will always be considered as secondary language
66				OCROptions options = new OCROptions();
67				if(use_iris) options.setOCREngine("iris");
68				options.addLang("deu");
69				options.addLang("fra");
70				options.addLang("eng");
71
72				// C) Run OCR on the .jpg with options
73				OCRModule.imageToPDF(doc, input_path + "multi_lang.jpg", options);
74
75				// D) check the result
76				doc.save(output_path + "multi_lang.pdf", SDFDoc.SaveMode.LINEARIZED, null);
77				System.out.println("Example 2: multi_lang.jpg");
78			} catch (Exception e) {
79				e.printStackTrace();
80			}
81
82			//--------------------------------------------------------------------------------
83			// Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image 
84			try (PDFDoc doc = new PDFDoc(input_path + "german_kids_song.pdf")) // A) Open the .pdf document
85			{
86				// B) Setup options with a single language and an ignore zone
87				OCROptions options = new OCROptions();
88				if(use_iris) options.setOCREngine("iris");
89				options.addLang("deu");
90
91				RectCollection zones = new RectCollection();
92				zones.addRect(424, 163, 493, 730);
93
94				options.addIgnoreZonesForPage(zones, 1);
95
96				// C) Run OCR on the .pdf with options
97				OCRModule.processPDF(doc, options);
98				
99				// D) check the result
100				doc.save(output_path + "german_kids_song.pdf", SDFDoc.SaveMode.LINEARIZED, null);
101				System.out.println("Example 3: german_kids_song.pdf");
102			} catch (Exception e) {
103				e.printStackTrace();
104			}
105
106			//--------------------------------------------------------------------------------
107			// Example 4) Process multipage tiff with text/ignore zones specified for each page, optionally provide English as the target language
108
109			try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
110			{
111				// B) Setup options with a single language plus text/ignore zones
112				OCROptions options = new OCROptions();
113				if(use_iris) options.setOCREngine("iris");
114				options.addLang("eng");
115
116				RectCollection zones = new RectCollection();
117				zones.addRect(1492, 56, 2236, 432);
118
119				// ignore signature box in the first 2 pages
120				options.addIgnoreZonesForPage(zones, 1);
121				options.addIgnoreZonesForPage(zones, 2);
122
123				// can use a combination of ignore and text boxes to focus on the page area of interest,
124				// as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
125				zones.clear();
126				zones.addRect(992, 1276, 1368, 1372);
127				options.addIgnoreZonesForPage(zones, 3);
128
129				// we only have text zones selected in page 3
130
131				zones.clear();
132				// select horizontal BUFFER ZONE sign
133				zones.addRect(900, 2384, 1236, 2480);
134				// select right vertical BUFFER ZONE sign
135				zones.addRect(1960, 1976, 2016, 2296);
136				// select Lot No.
137				zones.addRect(696, 1028, 1196, 1128);
138
139				// select part of the plan inside the BUFFER ZONE
140				zones.addRect(428, 1484, 1784, 2344);
141				zones.addRect(948, 1288, 1672, 1476);
142
143				options.addTextZonesForPage(zones, 3);
144
145				// C) Run OCR on the .tif with options
146				OCRModule.imageToPDF(doc, input_path + "bc_environment_protection.tif", options);
147				
148				// D) check the result
149				doc.save(output_path + "bc_environment_protection.pdf", SDFDoc.SaveMode.LINEARIZED, null);
150				System.out.println("Example 4: bc_environment_protection.tif");
151			} catch (Exception e) {
152				e.printStackTrace();
153			}
154
155			//--------------------------------------------------------------------------------
156			// Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the dictionary or filtering special
157			// out special characters), and finally applying modified OCR JSON to the source PDF document 
158			try (PDFDoc doc = new PDFDoc(input_path + "zero_value_test_no_text.pdf")) // A) Open the .pdf document
159			{
160				OCROptions options = new OCROptions();
161				if(use_iris) options.setOCREngine("iris");
162
163				// B) Run OCR on the .pdf with default English language
164				String json = OCRModule.getOCRJsonFromPDF(doc, options);
165
166				// C) Post-processing step (whatever it might be), but we just print json here
167				System.out.println("Have OCR result JSON, re-applying to PDF");
168
169				// D) Apply potentially modified OCR JSON to the PDF
170				OCRModule.applyOCRJsonToPDF(doc, json);
171
172				// E) Check the result
173				doc.save(output_path + "zero_value_test_no_text.pdf", SDFDoc.SaveMode.LINEARIZED, null);
174				System.out.println("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf");
175			} catch (Exception e) {
176				e.printStackTrace();
177			}
178
179			//--------------------------------------------------------------------------------
180			// Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
181			try (PDFDoc doc = new PDFDoc()) // A) Setup empty destination doc
182			{
183				OCROptions options = new OCROptions();
184				if(use_iris) options.setOCREngine("iris");
185
186				// B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
187				// in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
188				String xml = OCRModule.getOCRXmlFromImage(doc, input_path + "physics.tif", options);
189
190				// C) Post-processing step (whatever it might be), but we just print XML here
191				System.out.println("Have OCR result XML, applying to PDF");
192
193				// D) Apply potentially modified OCR XML to the PDF
194				OCRModule.applyOCRXmlToPDF(doc, xml);
195
196				// E) Check the result
197				doc.save(output_path + "physics.pdf", SDFDoc.SaveMode.LINEARIZED, null);
198				System.out.println("Example 6: extracting and applying OCR XML from physics.tif");
199			}
200			catch (Exception e) {
201				e.printStackTrace();
202			}
203
204			PDFNet.terminate();
205		} catch (Exception e) {
206			e.printStackTrace();
207		}
208	}
209}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6
7const { PDFNet } = require('@pdftron/pdfnet-node');
8const PDFTronLicense = require('../LicenseKey/LicenseKey');
9
10((exports) => {
11  'use strict';
12
13  //---------------------------------------------------------------------------------------
14  // The following sample illustrates how to use OCR module
15  //---------------------------------------------------------------------------------------
16  exports.runOCRTest = () => {
17    const main = async () => {
18      try {
19
20        PDFNet.addResourceSearchPath('../../lib/');
21
22        const useIRIS = await PDFNet.OCRModule.isIRISModuleAvailable();
23        if (!(await PDFNet.OCRModule.isModuleAvailable())) {
24          console.log('\nUnable to run OCRTest: Apryse SDK OCR module not available.');
25          console.log('---------------------------------------------------------------');
26          console.log('The OCR module is an optional add-on, available for download');
27          console.log('at https://docs.apryse.com/core/guides/info/modules#ocr-module . If you have already downloaded this');
28          console.log('module, ensure that the SDK is able to find the required files');
29          console.log('using the PDFNet.addResourceSearchPath() function.\n');
30
31          return;
32        }
33
34        // Relative path to the folder containing test files.
35        const input_path = '../TestFiles/OCR/';
36        const output_path = '../TestFiles/Output/';
37
38        //--------------------------------------------------------------------------------
39        // Example 1) Process image without specifying options, default language - English - is used
40        try {
41
42          // A) Setup empty destination doc
43          const doc = await PDFNet.PDFDoc.create();
44          
45          await doc.initSecurityHandler();
46
47          const opts = new PDFNet.OCRModule.OCROptions();
48          if(useIRIS) opts.setOCREngine('iris');
49
50          // B) Run OCR on the .png with options
51          await PDFNet.OCRModule.imageToPDF(doc, input_path + 'psychomachia_excerpt.png', opts);
52
53          // C) check the result
54          await doc.save(output_path + 'psychomachia_excerpt.pdf', 0);
55
56          console.log('Example 1: psychomachia_excerpt.png');
57
58        } catch (err) {
59          console.log(err);
60        }
61
62        //--------------------------------------------------------------------------------
63        // Example 2) Process document using multiple languages
64        try {
65          // A) Setup empty destination doc
66          const doc = await PDFNet.PDFDoc.create();
67          await doc.initSecurityHandler();
68
69          // B) Setup options with multiple target languages, English will always be considered as secondary language
70          const opts = new PDFNet.OCRModule.OCROptions();
71          if(useIRIS) opts.setOCREngine('iris');
72          opts.addLang('deu');
73          opts.addLang('fra');
74          opts.addLang('eng');
75
76          // C) Run OCR on the .jpg with options
77          await PDFNet.OCRModule.imageToPDF(doc, input_path + 'multi_lang.jpg', opts);
78
79          // D) check the result
80          await doc.save(output_path + 'multi_lang.pdf', 0);
81
82          console.log('Example 2: multi_lang.jpg');
83        } catch (err) {
84          console.log(err);
85        }
86
87        //--------------------------------------------------------------------------------
88        // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image 
89        try {
90          // A) Open the .pdf document
91          const doc = await PDFNet.PDFDoc.createFromFilePath(input_path + 'german_kids_song.pdf');
92          doc.initSecurityHandler();
93
94          // B) Setup options with a single language and an ignore zone
95          const opts = new PDFNet.OCRModule.OCROptions();
96          if(useIRIS) opts.setOCREngine('iris');
97          opts.addLang('deu');
98
99          const ignore_zones = [];
100          ignore_zones.push(new PDFNet.Rect(424, 163, 493, 730));
101          opts.addIgnoreZonesForPage(ignore_zones, 1);
102
103          // C) Run OCR on the .pdf with options
104          await PDFNet.OCRModule.processPDF(doc, opts);
105
106          // D) check the result
107          await doc.save(output_path + 'german_kids_song.pdf', 0);
108
109          console.log('Example 3: german_kids_song.pdf');
110        } catch (err) {
111          console.log(err);
112        }
113
114        //--------------------------------------------------------------------------------
115        // Example 4) Process multipage tiff with text/ignore zones specified for each page, optionally provide English as the target language
116        try {
117          // A) Setup empty destination doc
118          const doc = await PDFNet.PDFDoc.create();
119          await doc.initSecurityHandler();
120
121          // B) Setup options with a single language plus text/ignore zones
122          const opts = new PDFNet.OCRModule.OCROptions();
123          if(useIRIS) opts.setOCREngine('iris');
124          opts.addLang('eng');
125
126          var ignore_zones = [];
127          // ignore signature box in the first 2 pages
128          ignore_zones.push(new PDFNet.Rect(1492, 56, 2236, 432));
129          opts.addIgnoreZonesForPage(ignore_zones, 1);
130
131          ignore_zones = [];
132          ignore_zones.push(new PDFNet.Rect(1492, 56, 2236, 432));
133          opts.addIgnoreZonesForPage(ignore_zones, 2);
134
135          // can use a combination of ignore and text boxes to focus on the page area of interest,
136          // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
137          ignore_zones = [];
138          ignore_zones.push(new PDFNet.Rect(992, 1276, 1368, 1372));
139          opts.addIgnoreZonesForPage(ignore_zones, 3);
140
141
142          const text_zones = [];
143          // we only have text zones selected in page 3
144
145          // select horizontal BUFFER ZONE sign
146          text_zones.push(new PDFNet.Rect(900, 2384, 1236, 2480));
147          // select right vertical BUFFER ZONE sign
148          text_zones.push(new PDFNet.Rect(1960, 1976, 2016, 2296));
149          // select Lot No.
150          text_zones.push(new PDFNet.Rect(696, 1028, 1196, 1128));
151
152          // select part of the plan inside the BUFFER ZONE
153          text_zones.push(new PDFNet.Rect(428, 1484, 1784, 2344));
154          text_zones.push(new PDFNet.Rect(948, 1288, 1672, 1476));
155          opts.addTextZonesForPage(text_zones, 3);
156
157          // C) Run OCR on the .tif with options
158          await PDFNet.OCRModule.imageToPDF(doc, input_path + 'bc_environment_protection.tif', opts);
159
160          // D) check the result
161          await doc.save(output_path + 'bc_environment_protection.pdf', 0);
162
163          console.log('Example 4: bc_environment_protection.tif');
164        } catch (err) {
165          console.log(err);
166        }
167
168        //--------------------------------------------------------------------------------
169        // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the dictionary or filtering special
170        // out special characters), and finally applying modified OCR JSON to the source PDF document 
171        try {
172          // A) Open the .pdf document
173          const doc = await PDFNet.PDFDoc.createFromFilePath(input_path + 'zero_value_test_no_text.pdf');
174          await doc.initSecurityHandler();
175
176          const opts = new PDFNet.OCRModule.OCROptions();
177          if(useIRIS) opts.setOCREngine('iris');
178
179          // B) Run OCR on the .pdf with default English language
180          const json = await PDFNet.OCRModule.getOCRJsonFromPDF(doc, opts);
181
182          // C) Post-processing step (whatever it might be)
183          console.log('Have OCR result JSON, re-applying to PDF ');
184
185          // D) Apply potentially modified OCR JSON to the PDF
186          await PDFNet.OCRModule.applyOCRJsonToPDF(doc, json);
187
188          // E) Check the result
189          await doc.save(output_path + 'zero_value_test_no_text.pdf', 0);
190
191          console.log('Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf');
192        } catch (err) {
193          console.log(err);
194        }
195
196        //--------------------------------------------------------------------------------
197        // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
198        try {
199
200          // A) Setup empty destination doc
201          const doc = await PDFNet.PDFDoc.create();
202          await doc.initSecurityHandler();
203
204          const opts = new PDFNet.OCRModule.OCROptions();
205          if(useIRIS) opts.setOCREngine('iris');
206
207          // B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
208          // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
209          const xml = await PDFNet.OCRModule.getOCRXmlFromImage(doc, input_path + 'physics.tif', opts);
210
211          // C) Post-processing step (whatever it might be)
212          console.log('Have OCR result XML, re-applying to PDF');
213
214          // D) Apply potentially modified OCR XML to the PDF
215          await PDFNet.OCRModule.applyOCRXmlToPDF(doc, xml);
216
217          // E) Check the result
218          await doc.save(output_path + 'physics.pdf', 0);
219
220          console.log('Example 6: extracting and applying OCR XML from physics.tif');
221        } catch (err) {
222          console.log(err);
223        }
224        console.log('Done.');
225      } catch (err) {
226        console.log(err);
227      }
228    };
229    PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function(error) {
230      console.log('Error: ' + JSON.stringify(error));
231    }).then(function(){ return PDFNet.shutdown(); });
232  };
233  exports.runOCRTest();
234})(exports);
235// eslint-disable-next-line spaced-comment
236//# sourceURL=OCRTest.js

1<?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
// Relative path to the folder containing the test files.
$input_path = getcwd()."/../../TestFiles/OCR/";
$output_path = getcwd()."/../../TestFiles/Output/";

//---------------------------------------------------------------------------------------
// The following sample illustrates how to use OCR module.
// Each example runs in its own try/catch so that a failure in one example neither stops
// the remaining examples nor skips the final PDFNet::Terminate() call.
//---------------------------------------------------------------------------------------

// The first step in every application using PDFNet is to initialize the
// library and set the path to common PDF resources. The library is usually
// initialized only once, but calling Initialize() multiple times is also fine.
PDFNet::Initialize($LicenseKey);
PDFNet::GetSystemFontList();    // Wait for fonts to be loaded if they haven't already. This is done because PHP can run into errors when shutting down if font loading is still in progress.

// The location of the OCR Module
PDFNet::AddResourceSearchPath("../../../Lib/");

// If the IRIS OCR module is available, will use that instead of the default
$use_iris = OCRModule::IsIRISModuleAvailable();
if (!OCRModule::IsModuleAvailable()) {
	// One echo per line: a single string containing both "\n" and literal line breaks
	// would print blank lines and the source indentation between the lines.
	echo "Unable to run OCRTest: PDFTron SDK OCR module not available.\n";
	echo "---------------------------------------------------------------\n";
	echo "The OCR module is an optional add-on, available for download\n";
	echo "at https://dev.apryse.com/. If you have already downloaded this\n";
	echo "module, ensure that the SDK is able to find the required files\n";
	echo "using the PDFNet::AddResourceSearchPath() function.\n";
} else {
	//--------------------------------------------------------------------------------
	// Example 1) Process image
	try {
		// A) Setup empty destination doc
		$doc = new PDFDoc();

		// B) Use the IRIS OCR engine if available
		$opts = new OCROptions();
		if ($use_iris) {
			$opts->SetOCREngine("iris");
		}

		// C) Run OCR on the .png with options
		OCRModule::ImageToPDF($doc, $input_path."psychomachia_excerpt.png", $opts);

		// D) Check the result
		$doc->Save($output_path."psychomachia_excerpt.pdf", 0);

		echo "Example 1: psychomachia_excerpt.png \n";
	} catch (Exception $e) {
		echo $e->getMessage()."\n";
	}

	//--------------------------------------------------------------------------------
	// Example 2) Process document using multiple languages
	try {
		// A) Setup empty destination doc
		$doc = new PDFDoc();

		// B) Setup options with multiple target languages, English will always be considered as secondary language
		$opts = new OCROptions();
		if ($use_iris) {
			$opts->SetOCREngine("iris");
		}
		$opts->AddLang("deu");
		$opts->AddLang("fra");
		$opts->AddLang("eng");

		// C) Run OCR on the .jpg with options
		OCRModule::ImageToPDF($doc, $input_path."multi_lang.jpg", $opts);

		// D) Check the result
		$doc->Save($output_path."multi_lang.pdf", 0);

		echo "Example 2: multi_lang.jpg \n";
	} catch (Exception $e) {
		echo $e->getMessage()."\n";
	}

	//--------------------------------------------------------------------------------
	// Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
	try {
		// A) Open the .pdf document
		$doc = new PDFDoc($input_path."german_kids_song.pdf");

		// B) Setup options with a single language and an ignore zone
		$opts = new OCROptions();
		if ($use_iris) {
			$opts->SetOCREngine("iris");
		}
		$opts->AddLang("deu");

		$ignore_zones = new RectCollection();
		$rect = new Rect(424.0, 163.0, 493.0, 730.0);
		$ignore_zones->AddRect($rect);
		$opts->AddIgnoreZonesForPage($ignore_zones, 1);

		// C) Run OCR on the .pdf with options
		OCRModule::ProcessPDF($doc, $opts);

		// D) Check the result
		$doc->Save($output_path."german_kids_song.pdf", 0);

		echo "Example 3: german_kids_song.pdf \n";
	} catch (Exception $e) {
		echo $e->getMessage()."\n";
	}

	//--------------------------------------------------------------------------------
	// Example 4) Process multipage tiff with text/ignore zones specified for each page, optionally provide English as the target language
	try {
		// A) Setup empty destination doc
		$doc = new PDFDoc();

		// B) Setup options with a single language plus text/ignore zones
		$opts = new OCROptions();
		if ($use_iris) {
			$opts->SetOCREngine("iris");
		}
		$opts->AddLang("eng");

		$ignore_zones = new RectCollection();
		// ignore signature box in the first 2 pages
		$ignore_zones->AddRect(new Rect(1492.0, 56.0, 2236.0, 432.0));
		$opts->AddIgnoreZonesForPage($ignore_zones, 1);
		$opts->AddIgnoreZonesForPage($ignore_zones, 2);

		// can use a combination of ignore and text boxes to focus on the page area of interest,
		// as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
		$ignore_zones->Clear();
		$ignore_zones->AddRect(new Rect(992.0, 1276.0, 1368.0, 1372.0));
		$opts->AddIgnoreZonesForPage($ignore_zones, 3);

		// we only have text zones selected in page 3
		$text_zones = new RectCollection();

		// select horizontal BUFFER ZONE sign
		$text_zones->AddRect(new Rect(900.0, 2384.0, 1236.0, 2480.0));
		// select right vertical BUFFER ZONE sign
		$text_zones->AddRect(new Rect(1960.0, 1976.0, 2016.0, 2296.0));
		// select Lot No.
		$text_zones->AddRect(new Rect(696.0, 1028.0, 1196.0, 1128.0));

		// select part of the plan inside the BUFFER ZONE
		$text_zones->AddRect(new Rect(428.0, 1484.0, 1784.0, 2344.0));
		$text_zones->AddRect(new Rect(948.0, 1288.0, 1672.0, 1476.0));
		$opts->AddTextZonesForPage($text_zones, 3);

		// C) Run OCR on the .tif with options
		OCRModule::ImageToPDF($doc, $input_path."bc_environment_protection.tif", $opts);

		// D) Check the result
		$doc->Save($output_path."bc_environment_protection.pdf", 0);

		echo "Example 4: bc_environment_protection.tif \n";
	} catch (Exception $e) {
		echo $e->getMessage()."\n";
	}

	//--------------------------------------------------------------------------------
	// Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words
	// not in the dictionary or filtering out special characters), and finally applying modified OCR JSON
	// to the source PDF document
	try {
		// A) Open the .pdf document
		$doc = new PDFDoc($input_path."zero_value_test_no_text.pdf");

		// B) Use the IRIS OCR engine if available
		$opts = new OCROptions();
		if ($use_iris) {
			$opts->SetOCREngine("iris");
		}

		// C) Run OCR on the .pdf with default English language
		$json = OCRModule::GetOCRJsonFromPDF($doc, $opts);

		// D) Post-processing step (whatever it might be), then apply the JSON to the PDF
		echo "Have OCR result JSON, re-applying to PDF \n";

		OCRModule::ApplyOCRJsonToPDF($doc, $json);

		// E) Check the result
		$doc->Save($output_path."zero_value_test_no_text.pdf", 0);

		echo "Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf \n";
	} catch (Exception $e) {
		echo $e->getMessage()."\n";
	}

	//--------------------------------------------------------------------------------
	// Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
	try {
		// A) Setup empty destination doc
		$doc = new PDFDoc();

		// B) Use the IRIS OCR engine if available
		$opts = new OCROptions();
		if ($use_iris) {
			$opts->SetOCREngine("iris");
		}

		// C) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
		// in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
		$xml = OCRModule::GetOCRXmlFromImage($doc, $input_path."physics.tif", $opts);

		// D) Post-processing step (whatever it might be), then apply the XML to the PDF
		echo "Have OCR result XML, re-applying to PDF \n";

		OCRModule::ApplyOCRXmlToPDF($doc, $xml);

		// E) Check the result
		$doc->Save($output_path."physics.pdf", 0);

		echo "Example 6: extracting and applying OCR XML from physics.tif \n";
	} catch (Exception $e) {
		echo $e->getMessage()."\n";
	}

	echo "Done. \n";
}

PDFNet::Terminate();
239
240?>

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from apryse_sdk import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
14# Relative path to the folder containing test files.
15input_path = "../../TestFiles/OCR/"
16output_path = "../../TestFiles/Output/"
17
18# ---------------------------------------------------------------------------------------
19# The following sample illustrates how to use OCR module
20# --------------------------------------------------------------------------------------
21
22def main():
23
24    # The first step in every application using PDFNet is to initialize the
25    # library and set the path to common PDF resources. The library is usually
26    # initialized only once, but calling Initialize() multiple times is also fine.
27    PDFNet.Initialize(LicenseKey)
28
29    iris_installed = True # Set to True if the IRIS OCR module is installed and you wish to use it
30
31    # The location of the OCR Module
32    if iris_installed:
33        PDFNet.AddResourceSearchPath("../../../IRISOCRModuleWindows/Lib/")
34    else:
35        PDFNet.AddResourceSearchPath("../../../OCRModuleWindows/Lib/")
36    
37    use_iris = OCRModule.IsIRISModuleAvailable()
38    
39    if not OCRModule.IsModuleAvailable():
40
41        print("""
42        Unable to run OCRTest: PDFTron SDK OCR module not available.
43        ---------------------------------------------------------------
44        The OCR module is an optional add-on, available for download
45        at https://dev.apryse.com/. If you have already downloaded this
46        module, ensure that the SDK is able to find the required files
47        using the PDFNet::AddResourceSearchPath() function.""")
48
49    else:
50
51        # Example 1) Process image
52        # --------------------------------------------------------------------------------
53
54        # A) Setup empty destination doc
55        doc = PDFDoc()
56
57        # B) Set English as the language of choice
58        opts = OCROptions()
59        if use_iris: opts.SetOCREngine("iris")
60        opts.AddLang("eng")
61
62        # C) Run OCR on the .png with options
63        OCRModule.ImageToPDF(doc, input_path + "psychomachia_excerpt.png", opts)
64
65        # D) Check the result
66        doc.Save(output_path + "psychomachia_excerpt.pdf", 0)
67
68        print("Example 1: psychomachia_excerpt.png")
69
70        # Example 2) Process document using multiple languages
71        # --------------------------------------------------------------------------------
72
73        # A) Setup empty destination doc
74        doc = PDFDoc()
75
76        # B) Setup options with multiple target languages, English will always be considered as secondary language
77        opts = OCROptions()
78        if use_iris: opts.SetOCREngine("iris")
79        opts.AddLang("deu")
80        opts.AddLang("fra")
81        opts.AddLang("eng")
82
83        # C) Run OCR on the .jpg with options
84        OCRModule.ImageToPDF(doc, input_path + "multi_lang.jpg", opts)
85
86        # D) Check the result
87        doc.Save(output_path + "multi_lang.pdf", 0)
88
89        print("Example 2: multi_lang.jpg")
90
91        # Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
92        # --------------------------------------------------------------------------------
93
94        # A) Open the .pdf document
95        doc = PDFDoc(input_path + "german_kids_song.pdf")
96
97        # B) Setup options with a single language and an ignore zone
98        opts = OCROptions()
99        if use_iris: opts.SetOCREngine("iris")
100        opts.AddLang("deu")
101
102        ignore_zones = RectCollection()
103        ignore_zones.AddRect(Rect(424, 163, 493, 730))
104        opts.AddIgnoreZonesForPage(ignore_zones, 1)
105
106        # C) Run OCR on the .pdf with options
107        OCRModule.ProcessPDF(doc, opts)
108
109        # D) check the result
110        doc.Save(output_path + "german_kids_song.pdf", 0)
111
112        print("Example 3: german_kids_song.pdf")
113
114        # Example 4) Process multi-page tiff with text/ignore zones specified for each page,
115        # --------------------------------------------------------------------------------
116
117        # A) Setup empty destination doc
118
119        doc = PDFDoc()
120        # B) Setup options with a single language plus text/ignore zones
121
122        opts = OCROptions()
123        if use_iris: opts.SetOCREngine("iris")
124        opts.AddLang("eng")
125
126        ignore_zones = RectCollection()
127
128        # ignore signature box in the first 2 pages
129        ignore_zones.AddRect(Rect(1492, 56, 2236, 432))
130        opts.AddIgnoreZonesForPage(ignore_zones, 1)
131        opts.AddIgnoreZonesForPage(ignore_zones, 2)
132
133        # can use a combination of ignore and text boxes to focus on the page area of interest,
134        # as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
135        ignore_zones.Clear()
136        ignore_zones.AddRect(Rect(992, 1276, 1368, 1372))
137        opts.AddIgnoreZonesForPage(ignore_zones, 3)
138
139        text_zones = RectCollection()
140        # we only have text zones selected in page 3
141
142        # select horizontal BUFFER ZONE sign
143        text_zones.AddRect(Rect(900, 2384, 1236, 2480))
144
145        # select right vertical BUFFER ZONE sign
146        text_zones.AddRect(Rect(1960, 1976, 2016, 2296))
147        # select Lot No.
148        text_zones.AddRect(Rect(696, 1028, 1196, 1128))
149
150        # select part of the plan inside the BUFFER ZONE
151        text_zones.AddRect(Rect(428, 1484, 1784, 2344))
152        text_zones.AddRect(Rect(948, 1288, 1672, 1476))
153        opts.AddTextZonesForPage(text_zones, 3)
154
155        # C) Run OCR on the .pdf with options
156        OCRModule.ImageToPDF(doc, input_path + "bc_environment_protection.tif", opts)
157
158        # D) check the result
159        doc.Save(output_path + "bc_environment_protection.pdf", 0)
160
161        print("Example 4: bc_environment_protection.tif")
162
163        # Example 5) Alternative workflow for extracting OCR result JSON, postprocessing
164        # (e.g., removing words not in the dictionary or filtering special
165        # out special characters), and finally applying modified OCR JSON to the source PDF document
166        # --------------------------------------------------------------------------------
167
168        # A) Open the .pdf document
169        doc = PDFDoc(input_path + "zero_value_test_no_text.pdf")
170        
171        # B) set English language
172        opts = OCROptions()
173        if use_iris: opts.SetOCREngine("iris")
174        opts.AddLang("eng")
175
176        # C) Run OCR on the .pdf
177        json = OCRModule.GetOCRJsonFromPDF(doc, opts)
178
179        # D) Post-processing step (whatever it might be)
180        print("Have OCR result JSON, re-applying to PDF")
181
182        # E) Apply potentially modified OCR JSON to the PDF
183        OCRModule.ApplyOCRJsonToPDF(doc, json)
184
185        # F) Check the result
186        doc.Save(output_path + "zero_value_test_no_text.pdf", 0)
187
188        print("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf")
189
190        # Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format,
191        # similar to the one used by TextExtractor
192        # --------------------------------------------------------------------------------
193
194        # A) Setup empty destination doc
195        doc = PDFDoc()
196
197        # B) set English language
198        opts = OCROptions()
199        if use_iris: opts.SetOCREngine("iris")
200        opts.AddLang("eng")
201
202        # C) Run OCR on the .tif with English language, extracting OCR results in XML format. Note that
203        # in the process we convert the source image into PDF.
204        # We reuse this PDF document later to add hidden text layer to it.
205
206        xml = OCRModule.GetOCRXmlFromImage(doc, input_path + "physics.tif", opts)
207
208        # D) Post-processing step (whatever it might be)
209        print("Have OCR result XML, re-applying to PDF")
210
211        # E) Apply potentially modified OCR XML to the PDF
212        OCRModule.ApplyOCRXmlToPDF(doc, xml)
213
214        # F) Check the result
215        doc.Save(output_path + "physics.pdf", 0)
216
217        print("Example 6: extracting and applying OCR XML from physics.tif")
218
219        PDFNet.Terminate()
220
221if __name__ == '__main__':
222    main()

#---------------------------------------------------------------------------------------
# Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
# Consult LICENSE.txt regarding license information.
#---------------------------------------------------------------------------------------

require '../../../PDFNetC/Lib/PDFNetRuby'
include PDFNetRuby
require '../../LicenseKey/RUBY/LicenseKey'

$stdout.sync = true

# Relative path to the folder containing test files.
input_path =  "../../TestFiles/OCR/"
output_path = "../../TestFiles/Output/"

#---------------------------------------------------------------------------------------
# The following sample illustrates how to use OCR module.
#
# The script runs seven independent examples. Each one builds a fresh OCROptions,
# runs OCR on one input file from input_path and saves the result to output_path.
#---------------------------------------------------------------------------------------

# The first step in every application using PDFNet is to initialize the 
# library and set the path to common PDF resources. The library is usually 
# initialized only once, but calling Initialize multiple times is also fine.
PDFNet.Initialize(PDFTronLicense.Key)

# The location of the OCR Module
PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")

# All examples run inside a single begin/rescue so that any error is printed
# and PDFNet.Terminate below is still reached.
begin

   # if the IRIS OCR module is available, will use that instead of the default
   use_iris = OCRModule.IsIRISModuleAvailable
   if !OCRModule.IsModuleAvailable
      puts 'Unable to run OCRTest: PDFTron SDK OCR module not available.'
      puts '---------------------------------------------------------------'
      puts 'The OCR module is an optional add-on, available for download'
      puts 'at https://dev.apryse.com/. If you have already downloaded this'
      puts 'module, ensure that the SDK is able to find the required files'
      puts 'using the PDFNet::AddResourceSearchPath() function.'

   else

      # Example 1) Process image with specifying options, IRIS OCR module and English as the language of choice
      # --------------------------------------------------------------------------------

      # A) Setup empty destination doc
      doc = PDFDoc.new

      # B) Setup options with:
      opts = OCROptions.new

      # B.1. IRIS OCR module, if available
      if use_iris
         opts.SetOCREngine("iris")
      end

      # B.2. English as the language of choice
      opts.AddLang("eng")

      # C) Run OCR on the .png with options
      OCRModule.ImageToPDF(doc, input_path + "psychomachia_excerpt.png", opts)

      # D) Check the result
      doc.Save(output_path + "psychomachia_excerpt.pdf", 0)
      puts "Example 1: psychomachia_excerpt.png"

      doc.Close

      # Example 2) Process document using multiple languages
      # --------------------------------------------------------------------------------

      # A) Setup empty destination doc
      doc = PDFDoc.new

      # B) Setup options with:
      opts = OCROptions.new

      # B.1. IRIS OCR module, if available
      if use_iris
         opts.SetOCREngine("iris")
      end

      # B.2. multiple target languages, English will always be considered as secondary language
      opts.AddLang("deu")
      opts.AddLang("fra")
      opts.AddLang("eng")

      # C) Run OCR on the .jpg with options
      OCRModule.ImageToPDF(doc, input_path + "multi_lang.jpg", opts)

      # D) Check the result
      doc.Save(output_path + "multi_lang.pdf", 0)
      puts "Example 2: multi_lang.jpg"

      doc.Close

      # Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image
      # --------------------------------------------------------------------------------

      # A) Open the .pdf document
      doc = PDFDoc.new(input_path + "german_kids_song.pdf")

      # B) Setup options with:
      opts = OCROptions.new

      # B.1. IRIS OCR module, if available
      if use_iris
         opts.SetOCREngine("iris")
      end

      # B.2. German as the language of choice
      opts.AddLang("deu")

      # B.3. ignore zone comprising a sidebar image
      ignore_zones = RectCollection.new
      ignore_zones.AddRect(Rect.new(424, 163, 493, 730))
      opts.AddIgnoreZonesForPage(ignore_zones, 1)

      # C) Run OCR on the .pdf with options
      OCRModule.ProcessPDF(doc, opts)

      # D) check the result
      doc.Save(output_path + "german_kids_song.pdf", 0)
      puts "Example 3: german_kids_song.pdf"

      doc.Close

      # Example 4) Process multi-page tiff with text/ignore zones specified for each page,
      # optionally provide English as the target language
      # --------------------------------------------------------------------------------

      # A) Setup empty destination doc
      doc = PDFDoc.new

      # B) Setup options with:
      opts = OCROptions.new

      # B.1. IRIS OCR module, if available
      if use_iris
         opts.SetOCREngine("iris")
      end

      # B.2. English as the language of choice
      opts.AddLang("eng")

      # B.3 text/ignore zones
      ignore_zones = RectCollection.new

      # ignore signature box in the first 2 pages
      ignore_zones.AddRect(Rect.new(1492, 56, 2236, 432))
      opts.AddIgnoreZonesForPage(ignore_zones, 1)

      opts.AddIgnoreZonesForPage(ignore_zones, 2)

      # can use a combination of ignore and text boxes to focus on the page area of interest,
      # as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
      ignore_zones.Clear
      ignore_zones.AddRect(Rect.new(992, 1276, 1368, 1372))
      opts.AddIgnoreZonesForPage(ignore_zones, 3)

      text_zones = RectCollection.new
      # we only have text zones selected in page 3

      # select horizontal BUFFER ZONE sign
      text_zones.AddRect(Rect.new(900, 2384, 1236, 2480))

      # select right vertical BUFFER ZONE sign
      text_zones.AddRect(Rect.new(1960, 1976, 2016, 2296))
      # select Lot No.
      text_zones.AddRect(Rect.new(696, 1028, 1196, 1128))

      # select part of the plan inside the BUFFER ZONE
      text_zones.AddRect(Rect.new(428, 1484, 1784, 2344))
      text_zones.AddRect(Rect.new(948, 1288, 1672, 1476))
      opts.AddTextZonesForPage(text_zones, 3)

      # C) Run OCR on the multi-page .tif with options
      OCRModule.ImageToPDF(doc, input_path + "bc_environment_protection.tif", opts)

      # D) check the result
      doc.Save(output_path + "bc_environment_protection.pdf", 0)
      puts "Example 4: bc_environment_protection.tif"

      doc.Close

      # Example 5) Alternative workflow for extracting OCR result JSON, postprocessing
      # (e.g., removing words not in the dictionary or filtering
      # out special characters), and finally applying modified OCR JSON to the source PDF document
      # --------------------------------------------------------------------------------

      # A) Open the .pdf document
      doc = PDFDoc.new(input_path + "zero_value_test_no_text.pdf")

      # B) Setup options with:
      opts = OCROptions.new

      # B.1. IRIS OCR module, if available
      if use_iris
         opts.SetOCREngine("iris")
      end

      # B.2. English as the language of choice
      opts.AddLang("eng")

      # C) Run OCR on the .pdf with options
      json = OCRModule.GetOCRJsonFromPDF(doc, opts)

      # D) Post-processing step (whatever it might be); this sample leaves the JSON
      # untouched and applies it straight back to the PDF
      puts "Have OCR result JSON, re-applying to PDF"
      OCRModule.ApplyOCRJsonToPDF(doc, json)

      # E) Check the result
      doc.Save(output_path + "zero_value_test_no_text.pdf", 0)
      puts "Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf"

      doc.Close

      # Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format,
      # similar to the one used by TextExtractor
      # --------------------------------------------------------------------------------

      # A) Setup empty destination doc
      doc = PDFDoc.new

      # B) Setup options with:
      opts = OCROptions.new

      # B.1. IRIS OCR module, if available
      if use_iris
         opts.SetOCREngine("iris")
      end

      # B.2. English as the language of choice
      opts.AddLang("eng")

      # C) Run OCR on the .tif with options, extracting OCR results in XML format. Note that
      # in the process we convert the source image into PDF.
      # We reuse this PDF document later to add hidden text layer to it.
      xml = OCRModule.GetOCRXmlFromImage(doc, input_path + "physics.tif", opts)

      # D) Post-processing step (whatever it might be); this sample leaves the XML
      # untouched and applies it straight back to the PDF
      puts "Have OCR result XML, re-applying to PDF"
      OCRModule.ApplyOCRXmlToPDF(doc, xml)

      # E) Check the result
      doc.Save(output_path + "physics.pdf", 0)
      puts "Example 6: extracting and applying OCR XML from physics.tif"

      doc.Close

      # Example 7) Resolution can be manually set, when DPI missing from metadata or is wrong
      # --------------------------------------------------------------------------------

      # A) Setup empty destination doc
      doc = PDFDoc.new

      # B) Setup options with:
      opts = OCROptions.new

      # B.1. IRIS OCR module, if available
      if use_iris
         opts.SetOCREngine("iris")
      end

      # B.2. text zone
      # The rectangle is the area to recognize on page 1, so it must be registered as a
      # text zone (as in Example 4); registering it as an ignore zone would exclude it.
      text_zones = RectCollection.new
      text_zones.AddRect(Rect.new(140, 870, 310, 920))
      opts.AddTextZonesForPage(text_zones, 1)

      # B.3 Manually override DPI
      opts.AddDPI(100)

      # C) Run OCR on the .jpg with options
      OCRModule.ImageToPDF(doc, input_path + "corrupted_dpi.jpg", opts)

      # D) Check the result
      doc.Save(output_path + "corrupted_dpi.pdf", 0)
      puts "Example 7: converting image with corrupted resolution metadata corrupted_dpi.jpg to pdf with searchable text"

      doc.Close

   end
   # Any failure in the examples above is reported on stdout; execution then
   # continues to PDFNet.Terminate.
   rescue Exception=>e
      puts e

end
PDFNet.Terminate

'---------------------------------------------------------------------------------------
' Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
' Consult legal.txt regarding legal and license information.     
'---------------------------------------------------------------------------------------
Imports System

Imports pdftron
Imports pdftron.Common
Imports pdftron.SDF
Imports pdftron.PDF

' <summary>
'---------------------------------------------------------------------------------------
' The following sample illustrates how to use OCR module.
'
' Main runs six independent examples. Each one builds its own OCROptions, runs OCR on
' one input file and saves the resulting PDF to the output folder. A PDFNetException
' raised by one example is printed and does not stop the remaining examples.
'---------------------------------------------------------------------------------------
' </summary>
Module OCRTestVB
    Dim pdfNetLoader As PDFNetLoader
    Sub New()
        pdfNetLoader = pdftron.PDFNetLoader.Instance()
    End Sub

    ' The main entry point for the application.
    Sub Main()

        ' The first step in every application using PDFNet is to initialize the 
        ' library and set the path to common PDF resources. The library is usually 
        ' initialized only once, but calling Initialize() multiple times is also fine.
        PDFNet.Initialize(PDFTronLicense.Key)

        ' Can optionally set path to the OCR module
        PDFNet.AddResourceSearchPath("../../../../../Lib/")

        ' If the IRIS OCR module is available, every example selects the "iris" engine
        ' instead of the default one.
        Dim useIris As Boolean = OCRModule.IsIRISModuleAvailable()
        If Not OCRModule.IsModuleAvailable() Then
            Console.WriteLine("")
            Console.WriteLine("Unable to run OCRTest: Apryse SDK OCR module not available.")
            Console.WriteLine("---------------------------------------------------------------")
            Console.WriteLine("The OCR module is an optional add-on, available for download")
            Console.WriteLine("at https://docs.apryse.com/core/guides/info/modules#ocr-module . If you have already downloaded this")
            Console.WriteLine("module, ensure that the SDK is able to find the required files")
            Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.")
            Console.WriteLine("")
            ' The library was initialized above, so shut it down on this early exit as well.
            PDFNet.Terminate()
            Return
        End If

        ' Relative path to the folder containing test files.
        Dim input_path As String = "../../../../TestFiles/OCR/"
        Dim output_path As String = "../../../../TestFiles/Output/"

        '--------------------------------------------------------------------------------
        ' Example 1) Process image
        Try
            ' A) Setup empty destination doc.
            Using doc As PDFDoc = New PDFDoc()

                ' B) Set English as the language of choice
                Dim opts As OCROptions = New OCROptions()
                If useIris Then opts.SetOCREngine("iris")
                opts.AddLang("eng")

                ' C) Run OCR on the .png with options
                OCRModule.ImageToPDF(doc, input_path + "psychomachia_excerpt.png", opts)

                ' D) Check the result
                doc.Save(output_path + "psychomachia_excerpt.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 1: psychomachia_excerpt.png")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 2) Process document using multiple languages
        Try
            ' A) Setup empty destination doc.
            Using doc As PDFDoc = New PDFDoc()

                ' B) Setup options with multiple target languages, English will always be considered as secondary language
                Dim opts As OCROptions = New OCROptions()
                If useIris Then opts.SetOCREngine("iris")
                opts.AddLang("deu")
                opts.AddLang("fra")
                opts.AddLang("eng")

                ' C) Run OCR on the .jpg with options
                OCRModule.ImageToPDF(doc, input_path + "multi_lang.jpg", opts)

                ' D) Check the result
                doc.Save(output_path + "multi_lang.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 2: multi_lang.jpg")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try


        '--------------------------------------------------------------------------------
        ' Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image 
        Try
            ' A) Open the .pdf document.
            Using doc As PDFDoc = New PDFDoc(input_path + "german_kids_song.pdf")

                ' B) Setup options with a single language and an ignore zone
                Dim opts As OCROptions = New OCROptions()
                If useIris Then opts.SetOCREngine("iris")
                opts.AddLang("deu")

                Dim zones As RectCollection = New RectCollection()
                zones.AddRect(424, 163, 493, 730)
                opts.AddIgnoreZonesForPage(zones, 1)

                ' C) Run OCR on the .pdf with options
                OCRModule.ProcessPDF(doc, opts)

                ' D) Check the result
                doc.Save(output_path + "german_kids_song.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 3: german_kids_song.pdf")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 4) Process multipage tiff with text/ignore zones specified for each page
        Try
            ' A) Setup empty destination doc.
            Using doc As PDFDoc = New PDFDoc()

                ' B) Setup options with a single language plus text/ignore zones
                Dim opts As OCROptions = New OCROptions()
                If useIris Then opts.SetOCREngine("iris")
                opts.AddLang("eng")

                ' A single collection is reused for every zone set below; it is cleared
                ' after each registration before the next set is filled in.
                Dim zones As RectCollection = New RectCollection()

                ' ignore signature box in the first 2 pages
                zones.AddRect(1492, 56, 2236, 432)
                opts.AddIgnoreZonesForPage(zones, 1)
                zones.Clear()
                
                zones.AddRect(1492, 56, 2236, 432)
                opts.AddIgnoreZonesForPage(zones, 2)
                zones.Clear()

                ' can use a combination of ignore and text boxes to focus on the page area of interest,
                ' as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
                zones.AddRect(992, 1276, 1368, 1372)
                opts.AddIgnoreZonesForPage(zones, 3)
                zones.Clear()
                ' we only have text zones selected in page 3


                ' select horizontal BUFFER ZONE sign
                zones.AddRect(900, 2384, 1236, 2480)
                ' select right vertical BUFFER ZONE sign
                zones.AddRect(1960, 1976, 2016, 2296)
                ' select Lot No.
                zones.AddRect(696, 1028, 1196, 1128)

                ' select part of the plan inside the BUFFER ZONE
                zones.AddRect(428, 1484, 1784, 2344)
                zones.AddRect(948, 1288, 1672, 1476)
                ' These rectangles are the areas to recognize, so they are registered as
                ' text zones; registering them as ignore zones would exclude them from OCR.
                opts.AddTextZonesForPage(zones, 3)

                ' C) Run OCR on the multipage .tif with options
                OCRModule.ImageToPDF(doc, input_path + "bc_environment_protection.tif", opts)

                ' D) Check the result
                doc.Save(output_path + "bc_environment_protection.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 4: bc_environment_protection.tif")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words not in the dictionary or filtering
        ' out special characters), and finally applying modified OCR JSON to the source PDF document 
        Try
            ' A) Open the .pdf document.
            Using doc As PDFDoc = New PDFDoc(input_path + "zero_value_test_no_text.pdf")

                ' B) Set English as the language of choice
                Dim opts As OCROptions = New OCROptions()
                If useIris Then opts.SetOCREngine("iris")
                opts.AddLang("eng")

                ' C) Run OCR on the .pdf 
                Dim json As String = OCRModule.GetOCRJsonFromPDF(doc, opts)

                ' D) Post-processing step (whatever it might be); this sample leaves the JSON unchanged
                Console.WriteLine("Have OCR result JSON, re-applying to PDF")

                ' E) Apply potentially modified OCR JSON to the PDF
                OCRModule.ApplyOCRJsonToPDF(doc, json)

                ' F) Check the result
                doc.Save(output_path + "zero_value_test_no_text.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        '--------------------------------------------------------------------------------
        ' Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format, similar to the one used by TextExtractor
        Try
            ' A) Setup empty destination doc.
            Using doc As PDFDoc = New PDFDoc()

                ' B) Set English as the language of choice
                Dim opts As OCROptions = New OCROptions()
                If useIris Then opts.SetOCREngine("iris")
                opts.AddLang("eng")

                ' C) Run OCR on the .tif with English language, extracting OCR results in XML format. Note that
                ' in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.
                Dim xml As String = OCRModule.GetOCRXmlFromImage(doc, input_path + "physics.tif", opts)

                ' D) Post-processing step (whatever it might be); this sample leaves the XML unchanged
                Console.WriteLine("Have OCR result XML, re-applying to PDF")

                ' E) Apply potentially modified OCR XML to the PDF
                OCRModule.ApplyOCRXmlToPDF(doc, xml)

                ' F) Check the result
                doc.Save(output_path + "physics.pdf", SDFDoc.SaveOptions.e_remove_unused)

                Console.WriteLine("Example 6: extracting and applying OCR XML from physics.tif")

            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

        PDFNet.Terminate()
    End Sub

End Module

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales

Product:

OCR to search PDFs and Extract Text - Ruby Sample Code

Implementation steps