Search PDF for Text / String - TextSearch

Sample code for using the Apryse SDK to search text on PDF pages using regular expressions; provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. The TextSearch utility class builds on the functionality available in the TextExtractor sample to simplify the most common search operations. Learn more about our Server SDK and PDF Indexed Search Library.

1//
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3//
4
5using System;
6using pdftron;
7using pdftron.Common;
8using pdftron.Filters;
9using pdftron.SDF;
10using pdftron.PDF;
11
12
namespace TextSearchTestCS
{
	/// <summary>
	/// Sample illustrating the text search capabilities of PDFNet: a plain
	/// whole-word search followed by several regular-expression searches, ending
	/// with link annotations placed over the last match.
	/// </summary>
	class Class1
	{
		private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();

		// The explicit (empty) static constructor guarantees that the static field
		// initializer above has run before Main executes.
		static Class1() {}

		/// <summary>
		/// Searches "credit card numbers.pdf" in four steps (name, card number,
		/// AMEX number, AMEX owner), prints each match and saves a copy of the
		/// document with link annotations over the owner's name.
		/// </summary>
		/// <param name="args">Command-line arguments (unused).</param>
		static void Main(string[] args)
		{
			PDFNet.Initialize(PDFTronLicense.Key);

			// Relative paths to the folders containing the test files and the output.
			string input_path = "../../../../TestFiles/";
			string output_path = input_path + "Output/";

			// Sample code showing how to use high-level text search APIs.
			try
			{
				using (PDFDoc doc = new PDFDoc(input_path + "credit card numbers.pdf"))
				{
					doc.InitSecurityHandler();

					Int32 page_num = 0;
					String result_str = "", ambient_string = "";
					Highlights hlts = new Highlights();

					TextSearch txt_search = new TextSearch();
					Int32 mode = (Int32)(TextSearch.SearchMode.e_whole_word | TextSearch.SearchMode.e_page_stop | TextSearch.SearchMode.e_highlight);
					String pattern = "joHn sMiTh";

					// Call Begin() to initialize the text search.
					txt_search.Begin(doc, pattern, mode, -1, -1);

					int step = 0;

					// Call Run() iteratively to find all matching instances.
					while (true)
					{
						TextSearch.ResultCode code = txt_search.Run(ref page_num, ref result_str, ref ambient_string, hlts);

						if (code == TextSearch.ResultCode.e_found)
						{
							if (step == 0)
							{
								// Step 0: found "John Smith".
								// 'ambient_string' is not written to here because 'e_ambient_string'
								// is not set; 'e_highlight' was passed to Begin(), so highlight
								// information is requested from the very first search.
								Console.WriteLine(result_str + "'s credit card number is: ");

								// Now switch to regular expressions to find John's credit card number.
								mode = txt_search.GetMode();
								mode |= (Int32)(TextSearch.SearchMode.e_reg_expression | TextSearch.SearchMode.e_highlight);
								txt_search.SetMode(mode);
								pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; // or "(\\d{4}-){3}\\d{4}"
								txt_search.SetPattern(pattern);

								++step;
							}
							else if (step == 1)
							{
								// Step 1: found John's credit card number.
								Console.WriteLine("  " + result_str);

								// 'hlts' is written to, as 'e_highlight' has been set.
								// Output the highlight info of the credit card number.
								hlts.Begin(doc);
								while (hlts.HasNext())
								{
									Console.WriteLine("The current highlight is from page: " + hlts.GetCurrentPageNumber());
									hlts.Next();
								}

								// See if there is an AMEX card number.
								pattern = "\\d{4}-\\d{6}-\\d{5}";
								txt_search.SetPattern(pattern);

								++step;
							}
							else if (step == 2)
							{
								// Step 2: found an AMEX card number.
								Console.WriteLine("\nThere is an AMEX card number:\n  " + result_str);

								// Change mode to find the owner of the credit card; supposedly, the
								// owner's name precedes the number, so search upwards.
								mode = txt_search.GetMode();
								mode |= (Int32)(TextSearch.SearchMode.e_search_up);
								txt_search.SetMode(mode);
								// [A-Za-z] rather than [A-z]: the latter also matches the
								// punctuation characters that sit between 'Z' and 'a' in ASCII.
								pattern = "[A-Za-z]++ [A-Za-z]++";
								txt_search.SetPattern(pattern);

								++step;
							}
							else if (step == 3)
							{
								// Step 3: found the owner's name of the AMEX card.
								Console.WriteLine("Is the owner's name:\n  " + result_str + "?");

								// Add a link annotation based on the location of the found instance.
								hlts.Begin(doc);
								while (hlts.HasNext())
								{
									Page cur_page = doc.GetPage(hlts.GetCurrentPageNumber());
									double[] quads = hlts.GetCurrentQuads();
									// Each quad is stored as 8 doubles: four (x, y) corner points.
									int quad_count = quads.Length / 8;
									for (int i = 0; i < quad_count; ++i)
									{
										// Assume each quad is an axis-aligned rectangle and take
										// the bounding box of its four corners.
										int offset = 8 * i;
										double x1 = Math.Min(Math.Min(Math.Min(quads[offset + 0], quads[offset + 2]), quads[offset + 4]), quads[offset + 6]);
										double x2 = Math.Max(Math.Max(Math.Max(quads[offset + 0], quads[offset + 2]), quads[offset + 4]), quads[offset + 6]);
										double y1 = Math.Min(Math.Min(Math.Min(quads[offset + 1], quads[offset + 3]), quads[offset + 5]), quads[offset + 7]);
										double y2 = Math.Max(Math.Max(Math.Max(quads[offset + 1], quads[offset + 3]), quads[offset + 5]), quads[offset + 7]);

										pdftron.PDF.Annots.Link hyper_link = pdftron.PDF.Annots.Link.Create(doc, new Rect(x1, y1, x2, y2), pdftron.PDF.Action.CreateURI(doc, "http://www.pdftron.com"));
										hyper_link.RefreshAppearance();
										cur_page.AnnotPushBack(hyper_link);
									}
									hlts.Next();
								}
								doc.Save(output_path + "credit card numbers_linked.pdf", SDFDoc.SaveOptions.e_linearized);

								break;
							}
						}
						else if (code == TextSearch.ResultCode.e_page)
						{
							// End of a page reached: you can update your UI here, if needed.
						}
						else
						{
							// Any other result code ends the search.
							break;
						}
					}
				}
			}
			catch (PDFNetException e)
			{
				Console.WriteLine(e.Message);
			}
			finally
			{
				// Release PDFNet resources even when the search above fails.
				PDFNet.Terminate();
			}
		}
	}
}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6// This sample shows how to use pdftron.PDF.TextSearch to search text on PDF pages
7// using regular expressions. TextSearch utility class builds on functionality 
8// available in TextExtractor to simplify most common search operations.
9
#include <PDF/PDFNet.h>
#include <PDF/PDFDoc.h>
#include <PDF/TextSearch.h>
#include <PDF/Annot.h>
#include <cassert>
#include <iostream>
#include <string>
#include "../../LicenseKey/CPP/LicenseKey.h"
16
17using namespace std;
18using namespace pdftron;
19using namespace PDF;
20using namespace SDF;
21using namespace Common;
22
23#undef max
24#undef min
25#include <algorithm>
26
27int main(int argc, char *argv[])
28{
29	int ret = 0;
30	PDFNet::Initialize(LicenseKey);
31	std::string input_path =  "../../TestFiles/credit card numbers.pdf";
32	const char* filein = argc>1 ? argv[1] : input_path.c_str();
33
34	try
35	{
36		PDFDoc doc(filein);
37		doc.InitSecurityHandler();
38
39		TextSearch txt_search;
40		TextSearch::Mode mode = TextSearch::e_whole_word | TextSearch::e_page_stop;
41		UString pattern( "joHn sMiTh" );
42
43		//call Begin() method to initialize the text search.
44		txt_search.Begin( doc, pattern, mode );
45
46		int step = 0;
47	
48		//call Run() method iteratively to find all matching instances.
49		while ( true )
50		{
51			SearchResult result = txt_search.Run();
52
53			if ( result )
54			{
55				if ( step == 0 )
56				{	// Step 0: found "John Smith"
57					// note that, here, 'ambient_string' and 'hlts' are not written to, 
58					// as 'e_ambient_string' and 'e_highlight' are not set.
59
60					cout << result.GetMatch() << "'s credit card number is: " << endl;
61
62					//now switch to using regular expressions to find John's credit card number
63					mode = txt_search.GetMode();
64					mode |= TextSearch::e_reg_expression | TextSearch::e_highlight;
65					txt_search.SetMode(mode);
66					pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
67					txt_search.SetPattern(pattern);
68
69					++step;
70				}
71				else if ( step == 1 )
72				{
73					//step 1: found John's credit card number
74					cout << "  " << result.GetMatch() << endl;
75
76					//note that, here, 'hlts' is written to, as 'e_highlight' has been set.
77					//output the highlight info of the credit card number.
78					Highlights hlts = result.GetHighlights();
79					hlts.Begin(doc);
80					while ( hlts.HasNext() )
81					{
82						cout << "The current highlight is from page: " << hlts.GetCurrentPageNumber() << endl;
83						hlts.Next();
84					}
85
86					//see if there is an AMEX card number
87					pattern = "\\d{4}-\\d{6}-\\d{5}";
88					txt_search.SetPattern(pattern);
89
90					++step;
91				}
92				else if ( step == 2 )
93				{
94					//found an AMEX card number
95					cout << "\nThere is an AMEX card number:\n  " << result.GetMatch() << endl;
96
97					//change mode to find the owner of the credit card; supposedly, the owner's
98					//name proceeds the number
99					mode = txt_search.GetMode();
100					mode |= TextSearch::e_search_up;
101					txt_search.SetMode(mode);
102					pattern = "[A-z]++ [A-z]++";
103					txt_search.SetPattern(pattern);
104
105					++step;
106				}
107				else if ( step == 3 )
108				{
109					//found the owner's name of the AMEX card
110					cout << "Is the owner's name:\n  " << result.GetMatch() << "?\n" << flush;
111
112					//add a link annotation based on the location of the found instance
113					Highlights hlts = result.GetHighlights();
114					hlts.Begin(doc);
115					while ( hlts.HasNext() )
116					{
117						Page cur_page= doc.GetPage(hlts.GetCurrentPageNumber());
118						const double *quads;
119						int quad_count = hlts.GetCurrentQuads(quads);
120						for ( int i = 0; i < quad_count; ++i )
121						{
122							//assume each quad is an axis-aligned rectangle
123							const double *q = &quads[8*i];
124							double x1 = min(min(min(q[0], q[2]), q[4]), q[6]);
125							double x2 = max(max(max(q[0], q[2]), q[4]), q[6]);
126							double y1 = min(min(min(q[1], q[3]), q[5]), q[7]);
127							double y2 = max(max(max(q[1], q[3]), q[5]), q[7]);
128							Annots::Link hyper_link = Annots::Link::Create(doc, Rect(x1, y1, x2, y2), Action::CreateURI(doc, "http://www.pdftron.com"));
129							cur_page.AnnotPushBack(hyper_link);
130						}
131						hlts.Next();
132					}
133					std::string output_path = "../../TestFiles/Output/";
134					doc.Save((output_path + "credit card numbers_linked.pdf").c_str(), SDFDoc::e_linearized, 0);
135					break;
136				}
137			}
138			else if ( result.IsPageEnd() )
139			{
140				//you can update your UI here, if needed
141			}
142			else  
143			{
144				assert (result.IsDocEnd());
145				break;
146			}
147		}
148	}
149	catch(Exception& e)
150	{
151		cout << e << endl;
152		ret = 1;
153	}
154	catch(...)
155	{
156		cout << "Unknown Exception" << endl;
157		ret = 1;
158	}
159
160	PDFNet::Terminate();
161	return ret;
162}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8	"fmt"
9	"strconv"
10	. "math"
11	. "pdftron"
12)
13
14import  "pdftron/Samples/LicenseKey/GO"
15
16// This sample illustrates the basic text search capabilities of PDFNet.
17
18// Relative path to the folder containing the test files.
19var inputPath = "../../TestFiles/"
20var outputPath = "../../TestFiles/Output/"
21
22func main(){
23    // Initialize PDFNet
24    PDFNetInitialize(PDFTronLicense.Key)
25    doc := NewPDFDoc(inputPath + "credit card numbers.pdf")
26    doc.InitSecurityHandler()
27    
28    txtSearch := NewTextSearch()
29    mode := TextSearchE_whole_word | TextSearchE_page_stop
30    
31    pattern := "joHn sMiTh"
32    
33    // call Begin() method to initialize the text search.
34    txtSearch.Begin(doc, pattern, uint(mode))
35
36    step := 0
37    
38    // call Run() method iteratively to find all matching instances.
39    for true{
40        searchResult := txtSearch.Run()
41        if searchResult.IsFound(){
42            if step == 0{
43                // step 0: found "John Smith"
44                // note that, here, 'ambient_string' and 'hlts' are not written to, 
45                // as 'e_ambient_string' and 'e_highlight' are not set.
46                
47                fmt.Println(searchResult.GetMatch() + "'s credit card number is: ")
48                // now switch to using regular expressions to find John's credit card number
49                mode := PdftronPDFTextSearchTextSearchModes(txtSearch.GetMode())
50                mode = mode | TextSearchE_reg_expression | TextSearchE_highlight
51                txtSearch.SetMode(uint(mode))
52                pattern := "\\d{4}-\\d{4}-\\d{4}-\\d{4}"     //or "(\\d{4}-){3}\\d{4}"
53                txtSearch.SetPattern(pattern)
54                step = step + 1
55            }else if step == 1{
56                // step 1: found John's credit card number
57                fmt.Println("  " + searchResult.GetMatch())
58                
59                // note that, here, 'hlts' is written to, as 'e_highligh' has been set.
60                // output the highlight info of the credit card number
61                hlts := searchResult.GetHighlights()
62                hlts.Begin(doc)
63                for hlts.HasNext(){
64                    fmt.Println("The current highlight is from page: " + strconv.Itoa(hlts.GetCurrentPageNumber()))
65                    hlts.Next()
66                }
67                // see if there is an AMEX card number
68                pattern := "\\d{4}-\\d{6}-\\d{5}"
69                txtSearch.SetPattern(pattern)
70                
71                step = step + 1
72            }else if step == 2{
73                // found an AMEX card number
74                fmt.Println("\nThere is an AMEX card number:\n  " + searchResult.GetMatch())
75                
76                // change mode to find the owner of the credit card; supposedly, the owner's
77                // name proceeds the number
78                mode := PdftronPDFTextSearchTextSearchModes(txtSearch.GetMode())
79                mode = mode | TextSearchE_search_up
80                txtSearch.SetMode(uint(mode))
81                pattern := "[A-z]++ [A-z]++"
82                txtSearch.SetPattern(pattern)
83                step = step + 1
84            }else if step == 3{
85                // found the owner's name of the AMEX card
86                fmt.Println("Is the owner's name:\n  " + searchResult.GetMatch() + "?")
87                
88                // add a link annotation based on the location of the found instance
89                hlts := searchResult.GetHighlights()
90                hlts.Begin(doc)
91                
92                for hlts.HasNext(){
93                    curPage := doc.GetPage(uint(hlts.GetCurrentPageNumber()))
94                    quadsInfo := hlts.GetCurrentQuads()
95                    
96                    i := 0
97                    for i < int(quadsInfo.Size()){
98                        q := quadsInfo.Get(i)
99                        // assume each quad is an axis-aligned rectangle 
100                        x1 := Min(Min(Min(q.GetP1().GetX(), q.GetP2().GetX()), q.GetP3().GetX()), q.GetP4().GetX())
101                        x2 := Max(Max(Max(q.GetP1().GetX(), q.GetP2().GetX()), q.GetP3().GetX()), q.GetP4().GetX())
102                        y1 := Min(Min(Min(q.GetP1().GetY(), q.GetP2().GetY()), q.GetP3().GetY()), q.GetP4().GetY())
103                        y2 := Max(Max(Max(q.GetP1().GetY(), q.GetP2().GetY()), q.GetP3().GetY()), q.GetP4().GetY())
104                        hyperLink := LinkCreate(doc.GetSDFDoc(), NewRect(x1, y1, x2, y2), ActionCreateURI(doc.GetSDFDoc(), "http://www.pdftron.com"))
105                        curPage.AnnotPushBack(hyperLink)
106                        i = i + 1
107					}
108                    hlts.Next()
109				}
110                doc.Save(outputPath + "credit card numbers_linked.pdf", uint(SDFDocE_linearized))
111                break
112			}
113        }else if searchResult.IsPageEnd(){
114            //you can update your UI here, if needed
115        }else{
116            break
117		}
118    }    
119    doc.Close()
120    PDFNetTerminate()
121}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import com.pdftron.common.PDFNetException;
7import com.pdftron.pdf.*;
8import com.pdftron.sdf.SDFDoc;
9
10// This sample illustrates the basic text search capabilities of PDFNet.
11public class TextSearchTest {
12
13    public static void main(String[] args) {
14        PDFNet.initialize(PDFTronLicense.Key());
15        String input_path = "../../TestFiles/";
16
17        try (PDFDoc doc = new PDFDoc(input_path + "credit card numbers.pdf")) {
18            doc.initSecurityHandler();
19
20            TextSearch txt_search = new TextSearch();
21            int mode = TextSearch.e_whole_word | TextSearch.e_page_stop;
22
23            String pattern = "joHn sMiTh";
24
25            //PDFDoc doesn't allow simultaneous access from different threads. If this
26            //document could be used from other threads (e.g., the rendering thread inside
27            //PDFView/PDFViewCtrl, if used), it is good practice to lock it.
28            //Notice: don't forget to call doc.Unlock() to avoid deadlock.
29            doc.lock();
30
31            //call Begin() method to initialize the text search.
32            txt_search.begin(doc, pattern, mode, -1, -1);
33
34            int step = 0;
35
36            //call Run() method iteratively to find all matching instances.
37            while (true) {
38                TextSearchResult result = txt_search.run();
39
40                if (result.getCode() == TextSearchResult.e_found) {
41                    if (step == 0) {
42                        //step 0: found "John Smith"
43                        //note that, here, 'ambient_string' and 'hlts' are not written to,
44                        //as 'e_ambient_string' and 'e_highlight' are not set.
45                        System.out.println(result.getResultStr() + "'s credit card number is: ");
46
47                        //now switch to using regular expressions to find John's credit card number
48                        mode = txt_search.getMode();
49                        mode |= TextSearch.e_reg_expression | TextSearch.e_highlight;
50                        txt_search.setMode(mode);
51                        String new_pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
52                        txt_search.setPattern(new_pattern);
53
54                        step = step + 1;
55                    } else if (step == 1) {
56                        //step 1: found John's credit card number
57                        System.out.println("  " + result.getResultStr());
58
59                        //note that, here, 'hlts' is written to, as 'e_highlight' has been set.
60                        //output the highlight info of the credit card number
61                        Highlights hlts = result.getHighlights();
62                        hlts.begin(doc);
63                        while (hlts.hasNext()) {
64                            System.out.println("The current highlight is from page: " + hlts.getCurrentPageNumber());
65                            hlts.next();
66                        }
67
68                        //see if there is an AMEX card number
69                        String new_pattern = "\\d{4}-\\d{6}-\\d{5}";
70                        txt_search.setPattern(new_pattern);
71
72                        step = step + 1;
73                    } else if (step == 2) {
74                        //found an AMEX card number
75                        System.out.println("\nThere is an AMEX card number:");
76                        System.out.println("  " + result.getResultStr());
77
78                        //change mode to find the owner of the credit card; supposedly, the owner's
79                        //name proceeds the number
80                        mode = txt_search.getMode();
81                        mode |= TextSearch.e_search_up;
82                        txt_search.setMode(mode);
83                        String new_pattern = "[A-z]++ [A-z]++";
84                        txt_search.setPattern(new_pattern);
85
86                        step = step + 1;
87                    } else if (step == 3) {
88                        //found the owner's name of the AMEX card
89                        System.out.println("Is the owner's name:");
90                        System.out.println("  " + result.getResultStr() + "?");
91
92                        //add a link annotation based on the location of the found instance
93                        Highlights hlts = result.getHighlights();
94                        hlts.begin(doc);
95                        while (hlts.hasNext()) {
96                            Page cur_page = doc.getPage(hlts.getCurrentPageNumber());
97                            double[] q = hlts.getCurrentQuads();
98                            int quad_count = q.length / 8;
99                            for (int i = 0; i < quad_count; ++i) {
100                                //assume each quad is an axis-aligned rectangle
101                                int offset = 8 * i;
102                                double x1 = Math.min(Math.min(Math.min(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6]);
103                                double x2 = Math.max(Math.max(Math.max(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6]);
104                                double y1 = Math.min(Math.min(Math.min(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7]);
105                                double y2 = Math.max(Math.max(Math.max(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7]);
106                                com.pdftron.pdf.annots.Link hyper_link = com.pdftron.pdf.annots.Link.create(doc, new Rect(x1, y1, x2, y2), Action.createURI(doc, "http://www.pdftron.com"));
107                                cur_page.annotPushBack(hyper_link);
108                            }
109                            hlts.next();
110                        }
111                        String output_path = "../../TestFiles/Output/";
112                        doc.save(output_path + "credit card numbers_linked.pdf", SDFDoc.SaveMode.LINEARIZED, null);
113                        // output PDF doc
114                        break;
115                    }
116                } else if (result.getCode() == TextSearchResult.e_page) {
117                    //you can update your UI here, if needed
118                } else {
119                    break;
120                }
121            }
122
123            doc.unlock();
124        } catch (PDFNetException e) {
125            System.out.println(e);
126        }
127
128        PDFNet.terminate();
129    }
130}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6
7const { PDFNet } = require('@pdftron/pdfnet-node');
8const PDFTronLicense = require('../LicenseKey/LicenseKey');
9
10((exports) => {
11
12  exports.runTextSearchTest = () => {
13
14    const main = async() => {
15      // Relative path to the folder containing test files.
16      const inputURL = '../TestFiles/';
17      const inputFilename = 'credit card numbers.pdf'; // addimage.pdf, newsletter.pdf
18
19      try {
20        const doc = await PDFNet.PDFDoc.createFromFilePath(inputURL + inputFilename);
21        doc.initSecurityHandler();
22
23        const txtSearch = await PDFNet.TextSearch.create();
24        let mode = PDFNet.TextSearch.Mode.e_whole_word + PDFNet.TextSearch.Mode.e_page_stop; // Uses both whole word and page stop
25        let pattern = 'joHn sMiTh';
26
27        txtSearch.begin(doc, pattern, mode); // searches for the "pattern" in the document while following the inputted modes.
28
29        let step = 0;
30
31        // call Run() iteratively to find all matching instances of the word 'joHn sMiTh'
32        /* eslint-disable-next-line no-constant-condition */
33        while (true) {
34          const result = await txtSearch.run();
35          let hlts;
36          if (result.code === PDFNet.TextSearch.ResultCode.e_found) {
37            if (step === 0) { // Step 0: found "John Smith"
38              // note that, here, 'ambient_str' and 'highlights' are not written to,
39              // as 'e_ambient_string' and 'e_highlight' are not set.
40              console.log(result.out_str + "'s credit card number is: ");
41
42              // now switch to using regular expressions to find John's credit card number
43              mode = await txtSearch.getMode();
44              mode += PDFNet.TextSearch.Mode.e_reg_expression + PDFNet.TextSearch.Mode.e_highlight;
45              txtSearch.setMode(mode);
46              pattern = '\\d{4}-\\d{4}-\\d{4}-\\d{4}'; // or "(\\d{4}-){3}\\d{4}"
47              txtSearch.setPattern(pattern);
48
49              ++step;
50            } else if (step === 1) {
51              // step 1: found John's credit card number
52              console.log('  ' + result.out_str);
53              // note that, here, 'hlts' is written to, as 'e_highlight' has been set.
54              // output the highlight info of the credit card number.
55              hlts = result.highlights;
56              hlts.begin(doc);
57              while ((await hlts.hasNext())) {
58                const highlightPageNum = await hlts.getCurrentPageNumber();
59                console.log('The current highlight is from page: ' + highlightPageNum);
60                await hlts.next();
61              }
62              // see if there is an AMEX card number
63              pattern = '\\d{4}-\\d{6}-\\d{5}';
64              txtSearch.setPattern(pattern);
65
66              ++step;
67            } else if (step === 2) {
68              // found an AMEX card number
69              console.log('\nThere is an AMEX card number:\n  ' + result.out_str);
70
71              // change mode to find the owner of the credit card; supposedly, the owner's
72              // name proceeds the number
73              mode = await txtSearch.getMode();
74              mode += PDFNet.TextSearch.Mode.e_search_up;
75              txtSearch.setMode(mode);
76              pattern = '[A-z]++ [A-z]++';
77              txtSearch.setPattern(pattern);
78
79              ++step;
80            } else if (step === 3) {
81              // found the owner's name of the AMEX card
82              console.log("Is the owner's name:\n  " + result.out_str + '?');
83
84              // add a link annotation based on the location of the found instance
85              hlts = result.highlights;
86              await hlts.begin(doc); // is await needed?
87              while ((await hlts.hasNext())) {
88                const curPage = await doc.getPage((await hlts.getCurrentPageNumber()));
89                const quadArr = await hlts.getCurrentQuads();
90                for (let i = 0; i < quadArr.length; ++i) {
91                  const currQuad = quadArr[i];
92                  const x1 = Math.min(Math.min(Math.min(currQuad.p1x, currQuad.p2x), currQuad.p3x), currQuad.p4x);
93                  const x2 = Math.max(Math.max(Math.max(currQuad.p1x, currQuad.p2x), currQuad.p3x), currQuad.p4x);
94                  const y1 = Math.min(Math.min(Math.min(currQuad.p1y, currQuad.p2y), currQuad.p3y), currQuad.p4y);
95                  const y2 = Math.max(Math.max(Math.max(currQuad.p1y, currQuad.p2y), currQuad.p3y), currQuad.p4y);
96
97                  const hyperLink = await PDFNet.LinkAnnot.create(doc, (await PDFNet.Rect.init(x1, y1, x2, y2)));
98                  await hyperLink.setAction((await PDFNet.Action.createURI(doc, 'http://www.pdftron.com')));
99                  await curPage.annotPushBack(hyperLink);
100                }
101                hlts.next();
102              }
103              await doc.save('../TestFiles/Output/credit card numbers_linked.pdf', PDFNet.SDFDoc.SaveOptions.e_linearized);
104              break;
105            }
106          } else if (result.code === PDFNet.TextSearch.ResultCode.e_page) {
107            // you can update your UI here, if needed
108            console.log('page end');
109          } else if (result.code === PDFNet.TextSearch.ResultCode.e_done) {
110            break;
111          }
112        }
113      } catch (err) {
114        console.log(err);
115      }
116    };
117    PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function(error){console.log('Error: ' + JSON.stringify(error));}).then(function(){return PDFNet.shutdown();});
118  };
119  exports.runTextSearchTest();
120})(exports);
121// eslint-disable-next-line spaced-comment
122//# sourceURL=TextSearchTest.js

1<?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
// Relative path to the folder containing the test files.
$input_path = getcwd()."/../../TestFiles/";
$output_path = $input_path."Output/";

	// This script walks a fixed four-step search over "credit card numbers.pdf":
	//   step 0: plain whole-word search for a name,
	//   step 1: regular-expression search for a 16-digit card number,
	//   step 2: regular-expression search for an AMEX-style card number,
	//   step 3: upward search for the name in front of the AMEX number, whose
	//           location is then turned into link annotations and saved.

	PDFNet::Initialize($LicenseKey);
	PDFNet::GetSystemFontList();    // Wait for fonts to be loaded if they haven't already. This is done because PHP can run into errors when shutting down if font loading is still in progress.

	$doc = new PDFDoc($input_path."credit card numbers.pdf");
	$doc->InitSecurityHandler();

	$txt_search = new TextSearch();
	$mode = TextSearch::e_whole_word | TextSearch::e_page_stop;
	$pattern = "joHn sMiTh";

	//call Begin() method to initialize the text search.
	$txt_search->Begin( $doc, $pattern, $mode );

	$step = 0;
	
	//call Run() method iteratively to find all matching instances.
	while ( true )
	{
		$searchResult = $txt_search->Run();
		if ( $searchResult->IsFound() )
		{
			if ( $step == 0 )
			{	//step 0: found "John Smith"
				//note that, here, 'ambient_string' and 'hlts' are not written to, 
				//as 'e_ambient_string' and 'e_highlight' are not set.

				echo nl2br($searchResult->GetMatch()."'s credit card number is: \n");

				//now switch to using regular expressions to find John's credit card number
				$mode = $txt_search->GetMode();
				$mode |= TextSearch::e_reg_expression | TextSearch::e_highlight;
				$txt_search->SetMode($mode);
				$pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
				$txt_search->SetPattern($pattern);

				++$step;
			}
			else if ( $step == 1 )
			{
				//step 1: found John's credit card number
				echo nl2br("  ".$searchResult->GetMatch()."\n");
				
				//note that, here, 'hlts' is written to, as 'e_highlight' has been set.
				//output the highlight info of the credit card number.
				$hlts = $searchResult->GetHighlights();
				$hlts->Begin($doc);
				while ( $hlts->HasNext() )
				{
					echo nl2br("The current highlight is from page: ".$hlts->GetCurrentPageNumber()."\n");
					$hlts->Next();
				}

				//see if there is an AMEX card number
				$pattern = "\\d{4}-\\d{6}-\\d{5}";
				$txt_search->SetPattern($pattern);

				++$step;
			}
			else if ( $step == 2 )
			{
				//found an AMEX card number
				echo nl2br("\nThere is an AMEX card number:\n  ".$searchResult->GetMatch()."\n");

				//change mode to find the owner of the credit card; supposedly, the owner's
				//name precedes the number, hence the upward search
				$mode = $txt_search->GetMode();
				$mode |= TextSearch::e_search_up;
				$txt_search->SetMode($mode);
				$pattern = "[A-z]++ [A-z]++";
				$txt_search->SetPattern($pattern);

				++$step;
			}
			else if ( $step == 3 )
			{
				//found the owner's name of the AMEX card
				echo nl2br("Is the owner's name:\n  ".$searchResult->GetMatch()."?\n");

				//add a link annotation based on the location of the found instance
				$hlts = $searchResult->GetHighlights();
				$hlts->Begin($doc);
				while ( $hlts->HasNext() )
				{
					$cur_page= $doc->GetPage($hlts->GetCurrentPageNumber());
					$quadsInfo = $hlts->GetCurrentQuads();

					for ( $i = 0; $i < $quadsInfo->size(); ++$i )
					{
						//assume each quad is an axis-aligned rectangle, so its
						//bounding box is the min/max of the four corner points
						$q = $quadsInfo->get($i);
						$x1 = min($q->p1->x, $q->p2->x, $q->p3->x, $q->p4->x);
						$x2 = max($q->p1->x, $q->p2->x, $q->p3->x, $q->p4->x);
						$y1 = min($q->p1->y, $q->p2->y, $q->p3->y, $q->p4->y);
						$y2 = max($q->p1->y, $q->p2->y, $q->p3->y, $q->p4->y);
						$hyper_link = Link::CreateAnnot($doc->GetSDFDoc(), new Rect($x1, $y1, $x2, $y2), 
										Action::CreateURI($doc->GetSDFDoc(), "http://www.pdftron.com"));
						$cur_page->AnnotPushBack($hyper_link);
					}
					$hlts->Next();
				}
				
				$doc->Save($output_path."credit card numbers_linked.pdf", SDFDoc::e_linearized);

				break;
			}
		}
		else if ( $searchResult->IsPageEnd() )
		{
			//reached the end of a page ('e_page_stop' is set); keep searching.
			//you can update your UI here, if needed
		}
		else
		{
			//neither a match nor a page stop: stop searching
			break;
		}
	}
	
	$doc->Close();	
	PDFNet::Terminate();
132?>

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
14# This sample illustrates the basic text search capabilities of PDFNet.
15
16# Relative path to the folder containing the test files.
17input_path = "../../TestFiles/"
18output_path = "../../TestFiles/Output/"
19
def main():
    """Run a four-step text search over "credit card numbers.pdf".

    Step 0 finds a name with a plain whole-word search, step 1 switches to
    regular expressions to find a 16-digit card number, step 2 looks for an
    AMEX-style number, and step 3 searches upward for the name in front of
    it. The location of that name is turned into link annotations and the
    document is saved to output_path.
    """
    # Initialize PDFNet
    PDFNet.Initialize(LicenseKey)
    doc = PDFDoc(input_path + "credit card numbers.pdf")
    doc.InitSecurityHandler()

    txt_search = TextSearch()
    mode = TextSearch.e_whole_word | TextSearch.e_page_stop

    pattern = "joHn sMiTh"

    # call Begin() method to initialize the text search.
    txt_search.Begin(doc, pattern, mode)

    step = 0

    # call Run() method iteratively to find all matching instances.
    while True:
        searchResult = txt_search.Run()
        if searchResult.IsFound():
            if step == 0:
                # step 0: found "John Smith"
                # note that, here, 'ambient_string' and 'hlts' are not written to,
                # as 'e_ambient_string' and 'e_highlight' are not set.

                print(str(searchResult.GetMatch()) + "'s credit card number is: ")

                # now switch to using regular expressions to find John's credit card number
                mode = txt_search.GetMode()
                mode |= TextSearch.e_reg_expression | TextSearch.e_highlight
                txt_search.SetMode(mode)
                pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"     #or "(\\d{4}-){3}\\d{4}"
                txt_search.SetPattern(pattern)
                step = step + 1
            elif step == 1:
                # step 1: found John's credit card number
                print("  " + searchResult.GetMatch())

                # note that, here, 'hlts' is written to, as 'e_highlight' has been set.
                # output the highlight info of the credit card number
                hlts = searchResult.GetHighlights()
                hlts.Begin(doc)
                while hlts.HasNext():
                    print("The current highlight is from page: " + str(hlts.GetCurrentPageNumber()))
                    hlts.Next()

                # see if there is an AMEX card number
                pattern = "\\d{4}-\\d{6}-\\d{5}"
                txt_search.SetPattern(pattern)

                step = step + 1
            elif step == 2:
                # found an AMEX card number
                print("\nThere is an AMEX card number:\n  " + searchResult.GetMatch())

                # change mode to find the owner of the credit card; supposedly, the owner's
                # name precedes the number, hence the upward search
                mode = txt_search.GetMode()
                mode |= TextSearch.e_search_up
                txt_search.SetMode(mode)
                pattern = "[A-z]++ [A-z]++"
                txt_search.SetPattern(pattern)
                step = step + 1
            elif step == 3:
                # found the owner's name of the AMEX card
                print("Is the owner's name:\n  " + searchResult.GetMatch() + "?")

                # add a link annotation based on the location of the found instance
                hlts = searchResult.GetHighlights()
                hlts.Begin(doc)

                while hlts.HasNext():
                    cur_page = doc.GetPage(hlts.GetCurrentPageNumber())
                    quadsInfo = hlts.GetCurrentQuads()

                    for i in range(len(quadsInfo)):
                        q = quadsInfo[i]
                        # assume each quad is an axis-aligned rectangle, so its
                        # bounding box is the min/max of the four corner points
                        x1 = min(q.p1.x, q.p2.x, q.p3.x, q.p4.x)
                        x2 = max(q.p1.x, q.p2.x, q.p3.x, q.p4.x)
                        y1 = min(q.p1.y, q.p2.y, q.p3.y, q.p4.y)
                        y2 = max(q.p1.y, q.p2.y, q.p3.y, q.p4.y)
                        hyper_link = Link.Create(doc.GetSDFDoc(), Rect(x1, y1, x2, y2), Action.CreateURI(doc.GetSDFDoc(), "http://www.pdftron.com"))
                        cur_page.AnnotPushBack(hyper_link)
                    hlts.Next()
                doc.Save(output_path + "credit card numbers_linked.pdf", SDFDoc.e_linearized)
                break
        elif searchResult.IsPageEnd():
            # reached the end of a page ('e_page_stop' is set); keep searching.
            # you can update your UI here, if needed
            pass
        else:
            # neither a match nor a page stop: stop searching
            break

    doc.Close()
    PDFNet.Terminate()
116        
117if __name__ == '__main__':
118    main()

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
$stdout.sync = true

# This sample illustrates the basic text search capabilities of PDFNet.
# It walks a fixed four-step search over "credit card numbers.pdf":
#   step 0: plain whole-word search for a name,
#   step 1: regular-expression search for a 16-digit card number,
#   step 2: regular-expression search for an AMEX-style card number,
#   step 3: upward search for the name in front of the AMEX number, whose
#           location is then turned into link annotations and saved.

# Relative path to the folder containing the test files.
input_path = "../../TestFiles/"
output_path = "../../TestFiles/Output/"

	# Initialize PDFNet
	PDFNet.Initialize(PDFTronLicense.Key)
	doc = PDFDoc.new(input_path + "credit card numbers.pdf")
	doc.InitSecurityHandler
	
	txt_search = TextSearch.new
	mode = TextSearch::E_whole_word | TextSearch::E_page_stop
	
	pattern = "joHn sMiTh"
	
	# call Begin method to initialize the text search.
	txt_search.Begin(doc, pattern, mode)

	step = 0

	# call Run method iteratively to find all matching instances.
	while true do
		searchResult = txt_search.Run
		if searchResult.IsFound
			case step
			when 0
				# step 0: found "John Smith"
				# note that, here, 'ambient_string' and 'hlts' are not written to, 
				# as 'e_ambient_string' and 'e_highlight' are not set.
				
				puts searchResult.GetMatch + "'s credit card number is: "
				
				# now switch to using regular expressions to find John's credit card number
				mode = txt_search.GetMode
				mode |= TextSearch::E_reg_expression | TextSearch::E_highlight
				txt_search.SetMode(mode)
				pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"	 #or "(\\d{4}-){3}\\d{4}"
				txt_search.SetPattern(pattern)
				step = step + 1
			when 1
				# step 1: found John's credit card number
				puts "  " + searchResult.GetMatch
				
				# note that, here, 'hlts' is written to, as 'e_highlight' has been set.
				# output the highlight info of the credit card number
				hlts = searchResult.GetHighlights
				hlts.Begin(doc)
				while hlts.HasNext
					puts "The current highlight is from page: " + hlts.GetCurrentPageNumber.to_s
					hlts.Next
				end
					
				# see if there is an AMEX card number
				pattern = "\\d{4}-\\d{6}-\\d{5}"
				txt_search.SetPattern(pattern)
				
				step = step + 1
			when 2
				# found an AMEX card number
				puts "\nThere is an AMEX card number:\n  " + searchResult.GetMatch
				
				# change mode to find the owner of the credit card; supposedly, the owner's
				# name precedes the number, hence the upward search
				mode = txt_search.GetMode
				mode |= TextSearch::E_search_up
				txt_search.SetMode(mode)
				pattern = "[A-z]++ [A-z]++"
				txt_search.SetPattern(pattern)
				step = step + 1
			when 3
				# found the owner's name of the AMEX card
				puts "Is the owner's name:\n  " + searchResult.GetMatch + "?"
				
				# add a link annotation based on the location of the found instance
				hlts = searchResult.GetHighlights
				hlts.Begin(doc)
				
				while hlts.HasNext do
					cur_page = doc.GetPage(hlts.GetCurrentPageNumber)
					quadsInfo = hlts.GetCurrentQuads

					i = 0
					while i < quadsInfo.size do
						q = quadsInfo[i]
						# assume each quad is an axis-aligned rectangle, so its
						# bounding box is the min/max of the four corner points
						xs = [q.p1.x, q.p2.x, q.p3.x, q.p4.x]
						ys = [q.p1.y, q.p2.y, q.p3.y, q.p4.y]
						x1 = xs.min
						x2 = xs.max
						y1 = ys.min
						y2 = ys.max
						hyper_link = Link.Create(doc.GetSDFDoc, Rect.new(x1, y1, x2, y2), Action.CreateURI(doc.GetSDFDoc, "http://www.pdftron.com"))
						cur_page.AnnotPushBack(hyper_link)
						i = i + 1
					end			
					hlts.Next
				end
				doc.Save(output_path + "credit card numbers_linked.pdf", SDFDoc::E_linearized)
				break
			end
		elsif searchResult.IsPageEnd
			# reached the end of a page ('E_page_stop' is set); keep searching.
			# you can update your UI here, if needed
		else
			# neither a match nor a page stop: stop searching
			break
		end
	end	
	doc.Close
	PDFNet.Terminate

1'
2' Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3'
4
5Imports System
6Imports pdftron
7Imports pdftron.Common
8Imports pdftron.Filters
9Imports pdftron.SDF
10Imports pdftron.PDF
11
' This sample illustrates various text search capabilities of PDFNet.
' It walks a fixed four-step search over "credit card numbers.pdf":
'   step 0: plain whole-word search for a name,
'   step 1: regular-expression search for a 16-digit card number,
'   step 2: regular-expression search for an AMEX-style card number,
'   step 3: upward search for the name in front of the AMEX number, whose
'           location is then turned into link annotations and saved.
Module TextSearchTestVB
    Dim pdfNetLoader As PDFNetLoader
    Sub New()
        pdfNetLoader = pdftron.PDFNetLoader.Instance()
    End Sub

    Sub Main()
        PDFNet.Initialize(PDFTronLicense.Key)

        ' Relative path to the folder containing test files.
        Dim input_path As String = "../../../../TestFiles/"

        Try

            Using doc As PDFDoc = New PDFDoc(input_path & "credit card numbers.pdf")
                doc.InitSecurityHandler()

                ' Output arguments filled in by each call to Run().
                Dim page_num As Int32 = 0
                Dim result_str As String = "", ambient_string As String = ""
                Dim hlts As Highlights = New Highlights()

                Dim txt_search As TextSearch = New TextSearch()
                Dim mode As Int32 = CInt((TextSearch.SearchMode.e_whole_word Or TextSearch.SearchMode.e_page_stop Or TextSearch.SearchMode.e_highlight))
                Dim pattern As String = "joHn sMiTh"

                ' Call Begin() to initialize the text search.
                txt_search.Begin(doc, pattern, mode, -1, -1)
                Dim step_ As Integer = 0

                ' Call Run() iteratively to find all matching instances.
                While True
                    Dim code As TextSearch.ResultCode = txt_search.Run(page_num, result_str, ambient_string, hlts)

                    If code = TextSearch.ResultCode.e_found Then

                        If step_ = 0 Then
                            ' Step 0: found "John Smith".
                            Console.WriteLine(result_str & "'s credit card number is: ")

                            ' Switch to regular expressions to find John's credit card number.
                            mode = txt_search.GetMode()
                            mode = mode Or CInt((TextSearch.SearchMode.e_reg_expression Or TextSearch.SearchMode.e_highlight))
                            txt_search.SetMode(mode)
                            pattern = "\d{4}-\d{4}-\d{4}-\d{4}"
                            txt_search.SetPattern(pattern)
                            step_ += 1
                        ElseIf step_ = 1 Then
                            ' Step 1: found John's credit card number.
                            Console.WriteLine("  " & result_str)

                            ' Output the highlight info of the credit card number.
                            hlts.Begin(doc)

                            While hlts.HasNext()
                                Console.WriteLine("The current highlight is from page: " & hlts.GetCurrentPageNumber())
                                hlts.Next()
                            End While

                            ' See if there is an AMEX card number.
                            pattern = "\d{4}-\d{6}-\d{5}"
                            txt_search.SetPattern(pattern)
                            step_ += 1
                        ElseIf step_ = 2 Then
                            ' Step 2: found an AMEX card number.
                            Console.WriteLine(vbLf & "There is an AMEX card number:" & vbLf & "  " & result_str)

                            ' Search upward for the owner of the card; supposedly the
                            ' owner's name precedes the number.
                            mode = txt_search.GetMode()
                            mode = mode Or CInt((TextSearch.SearchMode.e_search_up))
                            txt_search.SetMode(mode)
                            pattern = "[A-z]++ [A-z]++"
                            txt_search.SetPattern(pattern)
                            step_ += 1
                        ElseIf step_ = 3 Then
                            ' Step 3: found the owner's name of the AMEX card.
                            Console.WriteLine("Is the owner's name:" & vbLf & "  " & result_str & "?")

                            ' Add a link annotation based on the location of the found instance.
                            hlts.Begin(doc)
                            While hlts.HasNext()
                                Dim cur_page As Page = doc.GetPage(hlts.GetCurrentPageNumber())
                                Dim quads As Double() = hlts.GetCurrentQuads()

                                ' The array is read as 8 doubles per quad (four x/y pairs).
                                ' Use integer division (\) so no Double-to-Integer
                                ' narrowing conversion is involved.
                                Dim quad_count As Integer = quads.Length \ 8

                                For i As Integer = 0 To quad_count - 1
                                    ' Assume each quad is an axis-aligned rectangle, so its
                                    ' bounding box is the min/max of the four corner points.
                                    Dim offset As Integer = 8 * i
                                    Dim x1 As Double = Math.Min(Math.Min(Math.Min(quads(offset + 0), quads(offset + 2)), quads(offset + 4)), quads(offset + 6))
                                    Dim x2 As Double = Math.Max(Math.Max(Math.Max(quads(offset + 0), quads(offset + 2)), quads(offset + 4)), quads(offset + 6))
                                    Dim y1 As Double = Math.Min(Math.Min(Math.Min(quads(offset + 1), quads(offset + 3)), quads(offset + 5)), quads(offset + 7))
                                    Dim y2 As Double = Math.Max(Math.Max(Math.Max(quads(offset + 1), quads(offset + 3)), quads(offset + 5)), quads(offset + 7))
                                    Dim hyper_link As pdftron.PDF.Annots.Link = pdftron.PDF.Annots.Link.Create(doc, New Rect(x1, y1, x2, y2), pdftron.PDF.Action.CreateURI(doc, "http://www.pdftron.com"))
                                    hyper_link.RefreshAppearance()
                                    cur_page.AnnotPushBack(hyper_link)
                                Next

                                hlts.Next()
                            End While

                            Dim output_path As String = "../../../../TestFiles/Output/"
                            doc.Save(output_path & "credit card numbers_linked.pdf", SDFDoc.SaveOptions.e_linearized)
                            Exit While
                        End If
                    ElseIf code = TextSearch.ResultCode.e_page Then
                        ' Reached the end of a page (e_page_stop is set); keep searching.
                        ' You can update your UI here, if needed.
                    Else
                        ' Neither a match nor a page stop: stop searching.
                        Exit While
                    End If
                End While
            End Using
        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try
        PDFNet.Terminate()
    End Sub
End Module

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales

Product:

Search PDF for Text / String - TextSearch