TextSearch

Sample C# code for using the Apryse SDK to search text on PDF pages using regular expressions. The TextSearch utility class builds on functionality available in TextExtractor to simplify the most common search operations. Learn more about our UWP SDK and PDF Indexed Search Library.

1//
2// Copyright (c) 2001-2020 by PDFTron Systems Inc. All Rights Reserved.
3//
4
5using System;
6using System.IO;
7using System.Threading.Tasks;
8using Windows.Foundation;
9
10using pdftron.Common;
11using pdftron.PDF;
12using pdftron.SDF;
13
14using PDFNetUniversalSamples.ViewModels;
15
namespace PDFNetSamples
{
    /// <summary>
    /// Sample that drives <c>pdftron.PDF.TextSearch</c> through a sequence of searches on
    /// "credit card numbers.pdf": a plain-text name, a credit card number pattern, an AMEX
    /// number pattern, and finally an upward search for the name preceding the AMEX number.
    /// The last match is covered with link annotations and the document is saved.
    /// </summary>
    public sealed class TextSearchTest : Sample
    {
        /// <summary>
        /// Registers the sample under the name "TextSearch" with its description.
        /// </summary>
        public TextSearchTest() :
            base("TextSearch", "This sample shows how to use pdftron.PDF.TextSearch to search text on PDF pages using regular expressions. TextSearch utility class builds on functionality available in TextExtractor to simplify most common search operations.")
        {
        }

        /// <summary>
        /// Runs the sample on a thread-pool thread.
        /// </summary>
        /// <returns>An action that completes once the whole sample, including the save, has finished.</returns>
        public override IAsyncAction RunAsync()
        {
            // The lambda returns the Task so that Task.Run binds to the Func<Task> overload and
            // tracks the full asynchronous operation. Wrapping an async lambda in System.Action
            // would make it 'async void', and the returned action would complete at the first
            // await instead of after the document has been saved.
            return Task.Run(() => RunSearchAsync()).AsAsyncAction();
        }

        /// <summary>
        /// Performs the multi-step text search and writes progress through <c>WriteLine</c>.
        /// Any exception is reported via <c>GetExceptionMessage</c> rather than propagated.
        /// </summary>
        private async Task RunSearchAsync()
        {
            WriteLine("--------------------------------");
            WriteLine("Starting TextSearch Test...");
            WriteLine("--------------------------------\n");
            try
            {
                string input_file_path = Path.Combine(InputPath, "credit card numbers.pdf");
                WriteLine("Opening input file " + input_file_path);
                PDFDoc doc = new PDFDoc(input_file_path);
                doc.InitSecurityHandler();

                // Output holders that TextSearch.Run() fills in on every call.
                pdftron.Common.Int32Ref page_num = new pdftron.Common.Int32Ref(0);
                pdftron.Common.StringRef result_str = new pdftron.Common.StringRef();
                pdftron.Common.StringRef ambient_string = new pdftron.Common.StringRef();
                Highlights hlts = new Highlights();

                TextSearch txt_search = new TextSearch();
                int mode = (int)(TextSearchSearchMode.e_whole_word) | (int)(TextSearchSearchMode.e_page_stop) | (int)(TextSearchSearchMode.e_highlight);
                String pattern = "John Smith";

                //call Begin() method to initialize the text search.
                txt_search.Begin(doc, pattern, mode, -1, -1);

                // Index of the search stage; each successful match advances to the next one.
                int step = 0;

                //call Run() method iteratively to find all matching instances.
                while (true)
                {
                    TextSearchResultCode code = txt_search.Run(page_num, result_str, ambient_string, hlts);

                    if (code == TextSearchResultCode.e_page)
                    {
                        //you can update your UI here, if needed
                        continue;
                    }

                    if (code != TextSearchResultCode.e_found)
                    {
                        // Any other result code ends the search.
                        break;
                    }

                    if (step == 0)
                    {
                        //step 0: found "John Smith"
                        //note that, here, 'ambient_string' is not written to, as
                        //'e_ambient_string' is not set; 'e_highlight' is already part
                        //of the initial mode.
                        WriteLine(result_str.Value + "'s credit card number is: ");

                        //now switch to using regular expressions to find John's credit card number
                        mode = txt_search.GetMode();
                        mode |= (int)(TextSearchSearchMode.e_reg_expression | TextSearchSearchMode.e_highlight);
                        txt_search.SetMode(mode);
                        pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
                        txt_search.SetPattern(pattern);

                        ++step;
                    }
                    else if (step == 1)
                    {
                        //step 1: found John's credit card number
                        WriteLine(result_str.Value);

                        //note that, here, 'hlts' is written to, as 'e_highlight' has been set.
                        //output the highlight info of the credit card number
                        hlts.Begin(doc);
                        while (hlts.HasNext())
                        {
                            WriteLine("The current highlight is from page: " + hlts.GetCurrentPageNumber());
                            hlts.Next();
                        }

                        //see if there is an AMEX card number
                        pattern = "\\d{4}-\\d{6}-\\d{5}";
                        txt_search.SetPattern(pattern);

                        ++step;
                    }
                    else if (step == 2)
                    {
                        //found an AMEX card number
                        WriteLine("There is an AMEX card number:\n " + result_str.Value);

                        //change mode to find the owner of the credit card; supposedly, the owner's
                        //name precedes the number
                        mode = txt_search.GetMode();
                        mode |= (int)(TextSearchSearchMode.e_search_up);
                        txt_search.SetMode(mode);
                        // [A-Za-z] rather than [A-z]: the range A-z also spans the ASCII
                        // punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
                        pattern = "[A-Za-z]+ [A-Za-z]+";
                        txt_search.SetPattern(pattern);

                        ++step;
                    }
                    else if (step == 3)
                    {
                        //found the owner's name of the AMEX card
                        WriteLine("Is the owner's name:\n " + result_str.Value + "?");

                        //add a link annotation based on the location of the found instance
                        AddLinkAnnotations(doc, hlts);

                        string output_file_path = Path.Combine(OutputPath, "credit card numbers_linked.pdf");
                        await doc.SaveAsync(output_file_path, SDFDocSaveOptions.e_linearized);
                        WriteLine("Done. Results saved in " + output_file_path);
                        await AddFileToOutputList(output_file_path).ConfigureAwait(false);

                        break;
                    }
                }
            }
            catch (Exception e)
            {
                WriteLine(GetExceptionMessage(e));
            }

            WriteLine("\n--------------------------------");
            WriteLine("Done TextSearch Test.");
            WriteLine("--------------------------------\n");
        }

        /// <summary>
        /// Adds one URI link annotation per highlight quad to the page the highlight is on.
        /// </summary>
        /// <param name="doc">Document that owns the pages and receives the annotations.</param>
        /// <param name="hlts">Highlights produced by the most recent search match.</param>
        private static void AddLinkAnnotations(PDFDoc doc, Highlights hlts)
        {
            hlts.Begin(doc);
            while (hlts.HasNext())
            {
                pdftron.PDF.Page cur_page = doc.GetPage(hlts.GetCurrentPageNumber());

                // Read as consecutive groups of 8 doubles: four (x, y) corner points per quad.
                double[] quads = hlts.GetCurrentQuads();
                int quad_count = quads.Length / 8;
                for (int i = 0; i < quad_count; ++i)
                {
                    //assume each quad is an axis-aligned rectangle
                    int offset = 8 * i;
                    double x1 = quads[offset];
                    double x2 = quads[offset];
                    double y1 = quads[offset + 1];
                    double y2 = quads[offset + 1];
                    for (int corner = 1; corner < 4; ++corner)
                    {
                        double x = quads[offset + 2 * corner];
                        double y = quads[offset + 2 * corner + 1];
                        x1 = Math.Min(x1, x);
                        x2 = Math.Max(x2, x);
                        y1 = Math.Min(y1, y);
                        y2 = Math.Max(y2, y);
                    }

                    pdftron.PDF.Annots.Link hyper_link = pdftron.PDF.Annots.Link.Create(doc.GetSDFDoc(), new pdftron.PDF.Rect(x1, y1, x2, y2), pdftron.PDF.Action.CreateURI(doc.GetSDFDoc(), "http://www.pdftron.com"));
                    hyper_link.RefreshAppearance();
                    cur_page.AnnotPushBack(hyper_link);
                }
                hlts.Next();
            }
        }
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales