Search PDF for Text / String - TextSearch - C++ Sample Code

Sample code for using the Apryse SDK to search text on PDF pages using regular expressions; provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. The TextSearch utility class builds on functionality available in the TextExtractor sample to simplify the most common search operations. Learn more about our Server SDK and PDF Indexed Search Library.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6// This sample shows how to use pdftron.PDF.TextSearch to search text on PDF pages
7// using regular expressions. TextSearch utility class builds on functionality
8// available in TextExtractor to simplify most common search operations.
9
10#include <PDF/PDFNet.h>
11#include <PDF/PDFDoc.h>
12#include <PDF/TextSearch.h>
13#include <PDF/Annot.h>
14#include <iostream>
15#include "../../LicenseKey/CPP/LicenseKey.h"
16
17using namespace std;
18using namespace pdftron;
19using namespace PDF;
20using namespace SDF;
21using namespace Common;
22
23#undef max
24#undef min
25#include <algorithm>
26
27int main(int argc, char *argv[])
28{
29 int ret = 0;
30 PDFNet::Initialize(LicenseKey);
31 std::string input_path = "../../TestFiles/credit card numbers.pdf";
32 const char* filein = argc>1 ? argv[1] : input_path.c_str();
33
34 try
35 {
36 PDFDoc doc(filein);
37 doc.InitSecurityHandler();
38
39 TextSearch txt_search;
40 TextSearch::Mode mode = TextSearch::e_whole_word | TextSearch::e_page_stop;
41 UString pattern( "joHn sMiTh" );
42
43 //call Begin() method to initialize the text search.
44 txt_search.Begin( doc, pattern, mode );
45
46 int step = 0;
47
48 //call Run() method iteratively to find all matching instances.
49 while ( true )
50 {
51 SearchResult result = txt_search.Run();
52
53 if ( result )
54 {
55 if ( step == 0 )
56 { // Step 0: found "John Smith"
57 // note that, here, 'ambient_string' and 'hlts' are not written to,
58 // as 'e_ambient_string' and 'e_highlight' are not set.
59
60 cout << result.GetMatch() << "'s credit card number is: " << endl;
61
62 //now switch to using regular expressions to find John's credit card number
63 mode = txt_search.GetMode();
64 mode |= TextSearch::e_reg_expression | TextSearch::e_highlight;
65 txt_search.SetMode(mode);
66 pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
67 txt_search.SetPattern(pattern);
68
69 ++step;
70 }
71 else if ( step == 1 )
72 {
73 //step 1: found John's credit card number
74 cout << " " << result.GetMatch() << endl;
75
76 //note that, here, 'hlts' is written to, as 'e_highlight' has been set.
77 //output the highlight info of the credit card number.
78 Highlights hlts = result.GetHighlights();
79 hlts.Begin(doc);
80 while ( hlts.HasNext() )
81 {
82 cout << "The current highlight is from page: " << hlts.GetCurrentPageNumber() << endl;
83 hlts.Next();
84 }
85
86 //see if there is an AMEX card number
87 pattern = "\\d{4}-\\d{6}-\\d{5}";
88 txt_search.SetPattern(pattern);
89
90 ++step;
91 }
92 else if ( step == 2 )
93 {
94 //found an AMEX card number
95 cout << "\nThere is an AMEX card number:\n " << result.GetMatch() << endl;
96
97 //change mode to find the owner of the credit card; supposedly, the owner's
98 //name proceeds the number
99 mode = txt_search.GetMode();
100 mode |= TextSearch::e_search_up;
101 txt_search.SetMode(mode);
102 pattern = "[A-z]++ [A-z]++";
103 txt_search.SetPattern(pattern);
104
105 ++step;
106 }
107 else if ( step == 3 )
108 {
109 //found the owner's name of the AMEX card
110 cout << "Is the owner's name:\n " << result.GetMatch() << "?\n" << flush;
111
112 //add a link annotation based on the location of the found instance
113 Highlights hlts = result.GetHighlights();
114 hlts.Begin(doc);
115 while ( hlts.HasNext() )
116 {
117 Page cur_page= doc.GetPage(hlts.GetCurrentPageNumber());
118 const double *quads;
119 int quad_count = hlts.GetCurrentQuads(quads);
120 for ( int i = 0; i < quad_count; ++i )
121 {
122 //assume each quad is an axis-aligned rectangle
123 const double *q = &quads[8*i];
124 double x1 = min(min(min(q[0], q[2]), q[4]), q[6]);
125 double x2 = max(max(max(q[0], q[2]), q[4]), q[6]);
126 double y1 = min(min(min(q[1], q[3]), q[5]), q[7]);
127 double y2 = max(max(max(q[1], q[3]), q[5]), q[7]);
128 Annots::Link hyper_link = Annots::Link::Create(doc, Rect(x1, y1, x2, y2), Action::CreateURI(doc, "http://www.pdftron.com"));
129 cur_page.AnnotPushBack(hyper_link);
130 }
131 hlts.Next();
132 }
133 std::string output_path = "../../TestFiles/Output/";
134 doc.Save((output_path + "credit card numbers_linked.pdf").c_str(), SDFDoc::e_linearized, 0);
135 break;
136 }
137 }
138 else if ( result.IsPageEnd() )
139 {
140 //you can update your UI here, if needed
141 }
142 else
143 {
144 assert (result.IsDocEnd());
145 break;
146 }
147 }
148 }
149 catch(Exception& e)
150 {
151 cout << e << endl;
152 ret = 1;
153 }
154 catch(...)
155 {
156 cout << "Unknown Exception" << endl;
157 ret = 1;
158 }
159
160 PDFNet::Terminate();
161 return ret;
162}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales