TextSearch

Sample C# code for using the Apryse SDK to search text on PDF pages using regular expressions. The TextSearch utility class builds on the functionality available in TextExtractor to simplify the most common search operations. Learn more about our Server SDK and PDF Indexed Search Library.

1//
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3//
4
5using System;
6using pdftron;
7using pdftron.Common;
8using pdftron.Filters;
9using pdftron.SDF;
10using pdftron.PDF;
11
12
namespace TextSearchTestCS
{
    /// <summary>
    /// Sample illustrating the text search capabilities of PDFNet.
    /// It looks up a name in "credit card numbers.pdf", switches to regular
    /// expressions to find credit card numbers, searches upwards for the owner
    /// of an AMEX card, and finally saves a copy of the document with link
    /// annotations placed over the owner's name.
    /// </summary>
    class Class1
    {
        // Obtains the PDFNetLoader instance when the class is initialized.
        // The explicit (empty) static constructor makes the field initializer
        // run exactly when the class is first used, i.e. before Main executes.
        // NOTE(review): presumably this is what loads the native PDFNet library - confirm.
        private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
        static Class1() {}

        /// <summary>
        /// Program entry point: initializes PDFNet, runs the search sample on the
        /// test document and terminates PDFNet again.
        /// </summary>
        /// <param name="args">Command line arguments (not used).</param>
        static void Main(string[] args)
        {
            PDFNet.Initialize(PDFTronLicense.Key);

            // Relative path to the folder containing test files.
            string input_path = "../../../../TestFiles/";
            // Results are written to the "Output" folder next to the test files.
            string output_path = input_path + "Output/";

            // Sample code showing how to use high-level text search APIs.
            try
            {
                using (PDFDoc doc = new PDFDoc(input_path + "credit card numbers.pdf"))
                {
                    doc.InitSecurityHandler();
                    RunSearch(doc, output_path + "credit card numbers_linked.pdf");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
            finally
            {
                // Terminate in 'finally' so PDFNet is shut down even when an
                // exception other than PDFNetException escapes the sample.
                PDFNet.Terminate();
            }
        }

        /// <summary>
        /// Runs the four-step search over <paramref name="doc"/>. Each time a match
        /// is found the search pattern and/or mode is changed for the next step:
        /// 0) the name "John Smith", 1) a 4-4-4-4 card number, 2) an AMEX-style
        /// 4-6-5 card number, 3) the name in front of the AMEX number.
        /// After step 3 the document is saved with link annotations added.
        /// </summary>
        /// <param name="doc">The opened document to search and annotate.</param>
        /// <param name="linked_output_file">Path of the annotated copy written after step 3.</param>
        private static void RunSearch(PDFDoc doc, string linked_output_file)
        {
            Int32 page_num = 0;
            String result_str = "", ambient_string = "";
            Highlights hlts = new Highlights();

            TextSearch txt_search = new TextSearch();
            Int32 mode = (Int32)(TextSearch.SearchMode.e_whole_word | TextSearch.SearchMode.e_page_stop | TextSearch.SearchMode.e_highlight);
            String pattern = "joHn sMiTh";

            // Call Begin() to initialize the text search.
            txt_search.Begin(doc, pattern, mode, -1, -1);

            int step = 0;

            // Call Run() iteratively to find all matching instances.
            while (true)
            {
                TextSearch.ResultCode code = txt_search.Run(ref page_num, ref result_str, ref ambient_string, hlts);

                if (code == TextSearch.ResultCode.e_page)
                {
                    // You can update your UI here, if needed.
                    continue;
                }
                if (code != TextSearch.ResultCode.e_found)
                {
                    // Neither a match nor a page notification: stop searching.
                    return;
                }

                switch (step)
                {
                    case 0:
                        // Step 0: found "John Smith".
                        // Note that 'ambient_string' is not written to here, as
                        // 'e_ambient_string' is not set; 'e_highlight' is already
                        // part of the initial mode.
                        Console.WriteLine(result_str + "'s credit card number is: ");

                        // Now switch to regular expressions to find John's credit card number.
                        mode = txt_search.GetMode();
                        mode |= (Int32)(TextSearch.SearchMode.e_reg_expression | TextSearch.SearchMode.e_highlight);
                        txt_search.SetMode(mode);
                        pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; // or "(\\d{4}-){3}\\d{4}"
                        txt_search.SetPattern(pattern);

                        ++step;
                        break;

                    case 1:
                        // Step 1: found John's credit card number.
                        Console.WriteLine(" " + result_str);

                        // 'hlts' is written to, as 'e_highlight' has been set;
                        // output the highlight info of the credit card number.
                        PrintHighlightPages(doc, hlts);

                        // See if there is an AMEX card number.
                        pattern = "\\d{4}-\\d{6}-\\d{5}";
                        txt_search.SetPattern(pattern);

                        ++step;
                        break;

                    case 2:
                        // Step 2: found an AMEX card number.
                        Console.WriteLine("\nThere is an AMEX card number:\n " + result_str);

                        // Change mode to find the owner of the credit card; supposedly,
                        // the owner's name precedes the number, so search upwards.
                        mode = txt_search.GetMode();
                        mode |= (Int32)(TextSearch.SearchMode.e_search_up);
                        txt_search.SetMode(mode);
                        // [A-Za-z] rather than [A-z]: the range A-z also covers the
                        // punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which is not
                        // wanted in a name.
                        pattern = "[A-Za-z]++ [A-Za-z]++";
                        txt_search.SetPattern(pattern);

                        ++step;
                        break;

                    case 3:
                        // Step 3: found the owner's name of the AMEX card.
                        Console.WriteLine("Is the owner's name:\n " + result_str + "?");

                        // Add link annotations based on the location of the found instance.
                        AddLinksOverHighlights(doc, hlts, "http://www.pdftron.com");
                        doc.Save(linked_output_file, SDFDoc.SaveOptions.e_linearized);

                        // The sample is complete.
                        return;
                }
            }
        }

        /// <summary>
        /// Writes the page number of every highlight in <paramref name="hlts"/> to the console.
        /// </summary>
        private static void PrintHighlightPages(PDFDoc doc, Highlights hlts)
        {
            hlts.Begin(doc);
            while (hlts.HasNext())
            {
                Console.WriteLine("The current highlight is from page: " + hlts.GetCurrentPageNumber());
                hlts.Next();
            }
        }

        /// <summary>
        /// Adds one link annotation pointing at <paramref name="uri"/> for every quad
        /// of every highlight in <paramref name="hlts"/>, on the page the highlight
        /// belongs to.
        /// </summary>
        private static void AddLinksOverHighlights(PDFDoc doc, Highlights hlts, string uri)
        {
            hlts.Begin(doc);
            while (hlts.HasNext())
            {
                Page cur_page = doc.GetPage(hlts.GetCurrentPageNumber());
                double[] quads = hlts.GetCurrentQuads();

                // Each quad occupies 8 consecutive doubles (four x/y pairs).
                int quad_count = quads.Length / 8;
                for (int i = 0; i < quad_count; ++i)
                {
                    Rect link_area = BoundingRect(quads, 8 * i);
                    pdftron.PDF.Annots.Link hyper_link = pdftron.PDF.Annots.Link.Create(doc, link_area, pdftron.PDF.Action.CreateURI(doc, uri));
                    hyper_link.RefreshAppearance();
                    cur_page.AnnotPushBack(hyper_link);
                }
                hlts.Next();
            }
        }

        /// <summary>
        /// Returns the rectangle spanned by the minimum and maximum x/y coordinates of
        /// the quad whose four x/y pairs start at <paramref name="offset"/> in
        /// <paramref name="quads"/>. Assumes each quad is an axis-aligned rectangle.
        /// </summary>
        private static Rect BoundingRect(double[] quads, int offset)
        {
            double x1 = quads[offset], x2 = quads[offset];
            double y1 = quads[offset + 1], y2 = quads[offset + 1];
            for (int corner = 1; corner < 4; ++corner)
            {
                double x = quads[offset + 2 * corner];
                double y = quads[offset + 2 * corner + 1];
                x1 = Math.Min(x1, x);
                x2 = Math.Max(x2, x);
                y1 = Math.Min(y1, y);
                y2 = Math.Max(y2, y);
            }
            return new Rect(x1, y1, x2, y2);
        }
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales