Search PDF for Text / String - TextSearch - PHP Sample Code

Sample code for using Apryse SDK to search text on PDF pages using regular expressions; provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. The TextSearch utility class builds on functionality available in TextExtractor Sample to simplify most common search operations. Learn more about our Server SDK and PDF Indexed Search Library.

<?php
//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
// Consult LICENSE.txt regarding license information.
//---------------------------------------------------------------------------------------
//
// TextSearch sample: walks through "credit card numbers.pdf" in four steps, changing the
// search pattern and mode between matches:
//   step 0 - plain whole-word search for the name "joHn sMiTh";
//   step 1 - regular-expression search for a 16-digit card number, printing highlight pages;
//   step 2 - regular-expression search for an AMEX-style number;
//   step 3 - upward search for a two-word name, then a link annotation is added over each
//            highlight quad and the document is saved.
//---------------------------------------------------------------------------------------
if (file_exists("../../../PDFNetC/Lib/PDFNetPHP.php")) {
    include("../../../PDFNetC/Lib/PDFNetPHP.php");
}
include("../../LicenseKey/PHP/LicenseKey.php");

// Relative path to the folder containing the test files.
$input_path = getcwd()."/../../TestFiles/";
$output_path = $input_path."Output/";

PDFNet::Initialize($LicenseKey);
// Wait for fonts to be loaded if they haven't already. This is done because PHP can run
// into errors when shutting down if font loading is still in progress.
PDFNet::GetSystemFontList();

$doc = new PDFDoc($input_path."credit card numbers.pdf");
$doc->InitSecurityHandler();

$txt_search = new TextSearch();
// e_page_stop is set, so the loop below also has to handle page-end results,
// not only "found" and "done".
$mode = TextSearch::e_whole_word | TextSearch::e_page_stop;
$pattern = "joHn sMiTh";

// Call Begin() to initialize the text search.
$txt_search->Begin($doc, $pattern, $mode);

// Index of the stage the walkthrough is currently in (0..3, see header comment).
$step = 0;

// Call Run() iteratively to find all matching instances.
while (true) {
    $searchResult = $txt_search->Run();

    if ($searchResult->IsFound()) {
        if ($step === 0) {
            // Step 0: found "John Smith".
            // Note that, here, 'ambient_string' and 'hlts' are not written to,
            // as 'e_ambient_string' and 'e_highlight' are not set.
            echo nl2br($searchResult->GetMatch()."'s credit card number is: \n");

            // Now switch to using regular expressions to find John's credit card number.
            $mode = $txt_search->GetMode();
            $mode |= TextSearch::e_reg_expression | TextSearch::e_highlight;
            $txt_search->SetMode($mode);
            $pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; // or "(\\d{4}-){3}\\d{4}"
            $txt_search->SetPattern($pattern);

            ++$step;
        } elseif ($step === 1) {
            // Step 1: found John's credit card number.
            echo nl2br(" ".$searchResult->GetMatch()."\n");

            // Note that, here, 'hlts' is written to, as 'e_highlight' has been set.
            // Output the highlight info of the credit card number.
            $hlts = $searchResult->GetHighlights();
            $hlts->Begin($doc);
            while ($hlts->HasNext()) {
                echo nl2br("The current highlight is from page: ".$hlts->GetCurrentPageNumber()."\n");
                $hlts->Next();
            }

            // See if there is an AMEX card number.
            $pattern = "\\d{4}-\\d{6}-\\d{5}";
            $txt_search->SetPattern($pattern);

            ++$step;
        } elseif ($step === 2) {
            // Step 2: found an AMEX card number.
            echo nl2br("\nThere is an AMEX card number:\n ".$searchResult->GetMatch()."\n");

            // Change mode to find the owner of the credit card; supposedly, the owner's
            // name precedes the number, hence the switch to searching upwards.
            $mode = $txt_search->GetMode();
            $mode |= TextSearch::e_search_up;
            $txt_search->SetMode($mode);
            // NOTE(review): the ASCII range A-z also covers the punctuation between 'Z'
            // and 'a' ([ \ ] ^ _ `); "[A-Za-z]" would be the letters-only class. Left
            // unchanged to keep the sample's matching behaviour.
            $pattern = "[A-z]++ [A-z]++";
            $txt_search->SetPattern($pattern);

            ++$step;
        } elseif ($step === 3) {
            // Step 3: found the owner's name of the AMEX card.
            echo nl2br("Is the owner's name:\n ".$searchResult->GetMatch()."?\n");

            // Add a link annotation based on the location of the found instance.
            $hlts = $searchResult->GetHighlights();
            $hlts->Begin($doc);
            while ($hlts->HasNext()) {
                $cur_page = $doc->GetPage($hlts->GetCurrentPageNumber());
                $quadsInfo = $hlts->GetCurrentQuads();

                for ($i = 0; $i < $quadsInfo->size(); ++$i) {
                    // Assume each quad is an axis-aligned rectangle, so its bounding box
                    // is simply the min/max of the four corner coordinates.
                    $q = $quadsInfo->get($i);
                    $x1 = min($q->p1->x, $q->p2->x, $q->p3->x, $q->p4->x);
                    $x2 = max($q->p1->x, $q->p2->x, $q->p3->x, $q->p4->x);
                    $y1 = min($q->p1->y, $q->p2->y, $q->p3->y, $q->p4->y);
                    $y2 = max($q->p1->y, $q->p2->y, $q->p3->y, $q->p4->y);
                    $hyper_link = Link::CreateAnnot(
                        $doc->GetSDFDoc(),
                        new Rect($x1, $y1, $x2, $y2),
                        Action::CreateURI($doc->GetSDFDoc(), "http://www.pdftron.com")
                    );
                    $cur_page->AnnotPushBack($hyper_link);
                }
                $hlts->Next();
            }

            $doc->Save($output_path."credit card numbers_linked.pdf", SDFDoc::e_linearized);

            // The walkthrough is complete once the linked document has been saved.
            break;
        }
    } elseif ($searchResult->IsPageEnd()) {
        // Page boundary reached without a match on this call (e_page_stop is set).
        // Query the result object itself; keep looping so the next page is searched.
        // You can update your UI here, if needed.
    } else {
        // Neither a match nor a page end: the search is finished.
        break;
    }
}

$doc->Close();
PDFNet::Terminate();
?>

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales