Some test text!

Search
Hamburger Icon

Use OCR in Obj-C to make searchable PDFs and extract text

More languages

More languages
C++
C#
C# (.NET Core)
Go
Java
Obj-C
JS (Node.js)
PHP
Python
Ruby
VB

This sample Obj-C code shows how to use the PDFTron OCR module on scanned documents in multiple languages. The OCR module can make PDFs searchable and extract scanned text for further indexing. Learn more about our Obj-C PDF Library.

Get Started Samples Download

To run this sample, get started with a free trial of Apryse SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------

#import <OBJC/PDFNetOBJC.h>
#import <Foundation/Foundation.h>

//---------------------------------------------------------------------------------------
// The following sample illustrates how to use OCR module
//---------------------------------------------------------------------------------------
/**
 * Entry point of the OCR sample.
 *
 * Initializes PDFNet and then runs six independent OCR examples, each inside
 * its own @try/@catch so that a failure in one example does not prevent the
 * remaining examples from running. Every example writes its result under
 * ../../TestFiles/Output/.
 *
 * @param argc Number of command-line arguments (unused).
 * @param argv Command-line arguments (unused).
 * @return 0 when initialization and every example completed without raising
 *         an NSException, 1 if any of them raised.
 */
int main(int argc, char *argv[])
{
    @autoreleasepool {
        // Process exit status. Stays 0 unless initialization or one of the
        // examples raises; exceptions are still logged and the run continues.
        int ret = 0;

        @try
        {
            // The first step in every application using PDFNet is to initialize the
            // library and set the path to common PDF resources. The library is usually
            // initialized only once, but calling Initialize() multiple times is also fine.
            [PTPDFNet Initialize: 0];

            //--------------------------------------------------------------------------------
            // Example 1) Process image without specifying options, default language - English - is used

            @try
            {
                // A) Setup empty destination doc

                PTPDFDoc * doc = [[PTPDFDoc alloc] init];

                // B) Run OCR on the .png without options (nil is passed, so defaults apply)

                NSString * img = @"../../TestFiles/psychomachia_excerpt.png";

                [PTOCRModule ImageToPDF: doc src: img options: nil];

                // C) Check the result

                NSString * outputFile = @"../../TestFiles/Output/psychomachia_excerpt.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 1 psychomachia_excerpt.png\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 2) Process document using multiple languages

            @try
            {
                // A) Setup empty destination doc

                PTPDFDoc * doc = [[PTPDFDoc alloc] init];

                // B) Setup options with multiple target languages, English will always be considered as secondary language

                PTObjSet * set = [[PTObjSet alloc] init];
                PTObj * options = [set CreateDict];
                PTObj * lang_array = [options PutArray: @"Langs"];
                [lang_array PushBackString: @"rus"];
                [lang_array PushBackString: @"deu"];

                // C) Run OCR on the .jpg with options

                NSString * img = @"../../TestFiles/multi_lang.jpg";

                [PTOCRModule ImageToPDF: doc src: img options: options];

                // D) Check the result

                NSString * outputFile = @"../../TestFiles/Output/multi_lang.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 2: multi_lang.jpg\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 3) Process a .pdf specifying a language - German - and ignore zone comprising a sidebar image

            @try
            {
                // A) Open the .pdf document

                PTPDFDoc * doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/german_kids_song.pdf"];

                // B) Setup options with a single language and an ignore zone

                PTObjSet * set = [[PTObjSet alloc] init];
                PTObj * options = [set CreateDict];
                PTObj * lang_array = [options PutArray: @"Langs"];
                [lang_array PushBackString: @"deu"];

                // "IgnoreZones" holds one nested array per page; only page 1 gets a zone here.
                PTObj * ignore_zones = [options PutArray: @"IgnoreZones"];
                PTObj * ignore_zones_page1 = [ignore_zones PushBackArray];
                [ignore_zones_page1 PushBackRect: 1768 y1: 680 x2: 2056 y2: 3044 ];

                // C) Run OCR on the .pdf with options

                [PTOCRModule ProcessPDF: doc options: options];

                // D) Check the result

                NSString * outputFile = @"../../TestFiles/Output/german_kids_song.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 3: german_kids_song.pdf\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 4) Process multipage tiff with text/ignore zones specified for each page, optionally provide English as the target language

            @try
            {
                // A) Setup empty destination doc

                PTPDFDoc * doc = [[PTPDFDoc alloc] init];

                // B) Setup options with a single language plus text/ignore zones

                PTObjSet * set = [[PTObjSet alloc] init];
                PTObj * options = [set CreateDict];
                PTObj * lang_array = [options PutArray: @"Langs"];
                [lang_array PushBackString: @"eng"];

                PTObj * ignore_zones = [options PutArray: @"IgnoreZones"];
                PTObj * ignore_zones_page1 = [ignore_zones PushBackArray];
                PTObj * ignore_zones_page2 = [ignore_zones PushBackArray];
                PTObj * ignore_zones_page3 = [ignore_zones PushBackArray];

                // ignore signature box in the first 2 pages
                [ignore_zones_page1 PushBackRect: 1492 y1: 56 x2: 2236 y2: 432 ];
                [ignore_zones_page2 PushBackRect: 1492 y1: 56 x2: 2236 y2: 432 ];

                // can use a combination of ignore and text boxes to focus on the page area of interest,
                // as ignore boxes are applied first, we remove the arrows before selecting part of the diagram
                [ignore_zones_page3 PushBackRect: 992 y1: 1276 x2: 1368 y2: 1372 ];

                PTObj * text_zones = [options PutArray: @"TextZones"];

                // we only have text zones selected in page 3, so the arrays for pages 1 and 2
                // stay empty; they are still pushed so that the third entry lines up with page 3
                [text_zones PushBackArray];
                [text_zones PushBackArray];
                PTObj * text_zones_page3 = [text_zones PushBackArray];

                // select horizontal BUFFER ZONE sign
                [text_zones_page3 PushBackRect: 900 y1: 2384 x2: 1236 y2: 2480 ];

                // select right vertical BUFFER ZONE sign
                [text_zones_page3 PushBackRect: 1960 y1: 1976 x2: 2016 y2: 2296 ];

                // select Lot No.
                [text_zones_page3 PushBackRect: 696 y1: 1028 x2: 1196 y2: 1128 ];

                // select part of the plan inside the BUFFER ZONE
                [text_zones_page3 PushBackRect: 428 y1: 1484 x2: 1784 y2: 2344 ];
                [text_zones_page3 PushBackRect: 948 y1: 1288 x2: 1672 y2: 1476 ];

                // C) Run OCR on the .tif with options

                NSString * img = @"../../TestFiles/bc_environment_protection.tif";

                [PTOCRModule ImageToPDF: doc src: img options: options];

                // D) Check the result

                NSString * outputFile = @"../../TestFiles/Output/bc_environment_protection.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 4: bc_environment_protection.tif\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 5) Alternative workflow for extracting OCR result JSON, postprocessing (e.g., removing words
            // not in the dictionary or filtering out special characters), and finally applying modified OCR JSON
            // to the source PDF document

            @try
            {
                // A) Open the .pdf document

                PTPDFDoc * doc = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/zero_value_test_no_text.pdf"];

                // B) Run OCR on the .pdf with default English language

                NSString * json = [PTOCRModule GetOCRJsonFromPDF: doc options: nil];

                // C) Post-processing step (whatever it might be), but we just print json here

                NSLog(@"OCR result JSON: %@", json);

                // D) Apply potentially modified OCR JSON to the PDF

                [PTOCRModule ApplyOCRJsonToPDF: doc json: json];

                // E) Check the result

                NSString * outputFile = @"../../TestFiles/Output/zero_value_test_no_text.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 5: extracting and applying OCR JSON from zero_value_test_no_text.pdf\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }

            //--------------------------------------------------------------------------------
            // Example 6) The postprocessing workflow has also an option of extracting OCR results in XML format,
            // similar to the one used by TextExtractor

            @try
            {
                // A) Setup empty destination doc

                PTPDFDoc * doc = [[PTPDFDoc alloc] init];

                // B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
                // in the process we convert the source image into PDF. We reuse this PDF document later to add hidden text layer to it.

                NSString * img = @"../../TestFiles/physics.tif";

                NSString * xml = [PTOCRModule GetOCRXmlFromImage: doc src: img options: nil];

                // C) Post-processing step (whatever it might be), but we just print XML here

                NSLog(@"OCR result XML: %@", xml);

                // D) Apply potentially modified OCR XML to the PDF

                [PTOCRModule ApplyOCRXmlToPDF: doc xml: xml];

                // E) Check the result

                NSString * outputFile = @"../../TestFiles/Output/physics.pdf";
                [doc SaveToFile: outputFile flags: e_ptremove_unused];

                NSLog(@"Example 6: extracting and applying OCR XML from physics.tif\n");
            }
            @catch(NSException *e)
            {
                NSLog(@"%@", e.reason);
                ret = 1;
            }
        }
        @catch(NSException *e)
        {
            // Reached when something outside the per-example handlers raises
            // (e.g. the Initialize call above).
            NSLog(@"%@", e.reason);
            ret = 1;
        }

        return ret;
    }
}