TextSearch

Sample code for using the Apryse SDK to search text on PDF pages using regular expressions; provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go, and VB. The TextSearch utility class builds on the functionality available in TextExtractor to simplify the most common search operations. Learn more about our Server SDK and PDF Indexed Search Library.

//
// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
//

using System;
using pdftron;
using pdftron.Common;
using pdftron.Filters;
using pdftron.SDF;
using pdftron.PDF;


namespace TextSearchTestCS
{
    /// <summary>
    /// This sample illustrates various text search capabilities of PDFNet.
    /// It walks through four consecutive searches on a single document:
    /// a whole-word name search, a regular-expression search for a credit
    /// card number, a second regular-expression search for an AMEX-style
    /// number, and finally an upward search for the name preceding that
    /// number, whose location is turned into link annotations.
    /// </summary>
    class Class1
    {
        private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();

        // NOTE(review): the empty static constructor is presumably kept so the
        // loader field above is initialized deterministically before Main runs;
        // confirm before removing it.
        static Class1() {}

        /// <summary>
        /// Runs the text search sample against "credit card numbers.pdf" and
        /// saves a copy with link annotations added over the last match.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            PDFNet.Initialize(PDFTronLicense.Key);

            // Relative paths to the folders containing the test files and the output.
            string input_path = "../../../../TestFiles/";
            string output_path = "../../../../TestFiles/Output/";

            // Sample code showing how to use the high-level text search APIs.
            try
            {
                using (PDFDoc doc = new PDFDoc(input_path + "credit card numbers.pdf"))
                {
                    doc.InitSecurityHandler();

                    Int32 page_num = 0;
                    String result_str = "", ambient_string = "";
                    Highlights hlts = new Highlights();

                    TextSearch txt_search = new TextSearch();
                    Int32 mode = (Int32)(TextSearch.SearchMode.e_whole_word | TextSearch.SearchMode.e_page_stop | TextSearch.SearchMode.e_highlight);
                    String pattern = "joHn sMiTh";

                    // Call Begin() to initialize the text search.
                    txt_search.Begin(doc, pattern, mode, -1, -1);

                    // Tracks which of the four consecutive searches is currently active.
                    int step = 0;

                    // Call Run() iteratively to find all matching instances.
                    while (true)
                    {
                        TextSearch.ResultCode code = txt_search.Run(ref page_num, ref result_str, ref ambient_string, hlts);

                        if (code == TextSearch.ResultCode.e_found)
                        {
                            if (step == 0)
                            {
                                // Step 0: found "John Smith".
                                // Note that 'ambient_string' is not written to here, as
                                // 'e_ambient_string' is not set. 'e_highlight' IS part of
                                // the initial mode, so 'hlts' is written to, but this
                                // step does not use it.
                                Console.WriteLine(result_str + "'s credit card number is: ");

                                // Now switch to regular expressions to find John's credit card number.
                                // ('e_highlight' is already set; it is kept in the mask for clarity.)
                                mode = txt_search.GetMode();
                                mode |= (Int32)(TextSearch.SearchMode.e_reg_expression | TextSearch.SearchMode.e_highlight);
                                txt_search.SetMode(mode);
                                pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; // or "(\\d{4}-){3}\\d{4}"
                                txt_search.SetPattern(pattern);

                                ++step;
                            }
                            else if (step == 1)
                            {
                                // Step 1: found John's credit card number.
                                Console.WriteLine(" " + result_str);

                                // 'hlts' is written to, as 'e_highlight' has been set.
                                // Output the highlight info of the credit card number.
                                hlts.Begin(doc);
                                while (hlts.HasNext())
                                {
                                    Console.WriteLine("The current highlight is from page: " + hlts.GetCurrentPageNumber());
                                    hlts.Next();
                                }

                                // See if there is an AMEX card number.
                                pattern = "\\d{4}-\\d{6}-\\d{5}";
                                txt_search.SetPattern(pattern);

                                ++step;
                            }
                            else if (step == 2)
                            {
                                // Step 2: found an AMEX card number.
                                Console.WriteLine("\nThere is an AMEX card number:\n " + result_str);

                                // Change mode to find the owner of the credit card; supposedly,
                                // the owner's name precedes the number, so search upwards.
                                mode = txt_search.GetMode();
                                mode |= (Int32)(TextSearch.SearchMode.e_search_up);
                                txt_search.SetMode(mode);

                                // Two words made of letters only. The range must be spelled
                                // [A-Za-z]: [A-z] would also match the ASCII punctuation
                                // that sits between 'Z' and 'a' ('[', '\', ']', '^', '_', '`').
                                pattern = "[A-Za-z]++ [A-Za-z]++";
                                txt_search.SetPattern(pattern);

                                ++step;
                            }
                            else if (step == 3)
                            {
                                // Step 3: found the owner's name of the AMEX card.
                                Console.WriteLine("Is the owner's name:\n " + result_str + "?");

                                // Add a link annotation based on the location of the found instance.
                                hlts.Begin(doc);
                                while (hlts.HasNext())
                                {
                                    Page cur_page = doc.GetPage(hlts.GetCurrentPageNumber());

                                    // Each quad is stored as 8 consecutive doubles (4 x/y pairs).
                                    double[] quads = hlts.GetCurrentQuads();
                                    int quad_count = quads.Length / 8;
                                    for (int i = 0; i < quad_count; ++i)
                                    {
                                        // Assume each quad is an axis-aligned rectangle and
                                        // take the bounding box of its four corners.
                                        int offset = 8 * i;
                                        double x1 = Math.Min(Math.Min(Math.Min(quads[offset + 0], quads[offset + 2]), quads[offset + 4]), quads[offset + 6]);
                                        double x2 = Math.Max(Math.Max(Math.Max(quads[offset + 0], quads[offset + 2]), quads[offset + 4]), quads[offset + 6]);
                                        double y1 = Math.Min(Math.Min(Math.Min(quads[offset + 1], quads[offset + 3]), quads[offset + 5]), quads[offset + 7]);
                                        double y2 = Math.Max(Math.Max(Math.Max(quads[offset + 1], quads[offset + 3]), quads[offset + 5]), quads[offset + 7]);

                                        pdftron.PDF.Annots.Link hyper_link = pdftron.PDF.Annots.Link.Create(doc, new Rect(x1, y1, x2, y2), pdftron.PDF.Action.CreateURI(doc, "http://www.pdftron.com"));
                                        hyper_link.RefreshAppearance();
                                        cur_page.AnnotPushBack(hyper_link);
                                    }
                                    hlts.Next();
                                }

                                doc.Save(output_path + "credit card numbers_linked.pdf", SDFDoc.SaveOptions.e_linearized);

                                // All four searches are done; leave the search loop.
                                break;
                            }
                        }
                        else if (code == TextSearch.ResultCode.e_page)
                        {
                            // You can update your UI here, if needed.
                        }
                        else
                        {
                            // Any other result code ends the search loop.
                            break;
                        }
                    }
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
            finally
            {
                // Release the library even if an unexpected exception propagates.
                PDFNet.Terminate();
            }
        }
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales