Sample C# code that uses the Apryse SDK to search for text on PDF pages using regular expressions. The TextSearch utility class builds on the functionality available in TextExtractor to simplify the most common search operations. Learn more about our Xamarin SDK and PDF Indexed Search Library.
//
// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
//

using System;
using pdftron;
using pdftron.Common;
using pdftron.Filters;
using pdftron.SDF;
using pdftron.PDF;


using NUnit.Framework;

namespace MiscellaneousSamples
{
    /// <summary>
    /// Illustrates various text search capabilities of PDFNet: a plain
    /// whole-word search, regular-expression searches, switching the search
    /// mode and pattern between calls to <c>Run()</c>, and turning the
    /// resulting highlights into link annotations.
    /// </summary>
    [TestFixture]
    public class TextSearchTest
    {
        /// <summary>
        /// Walks through "credit card numbers.pdf" in four chained searches:
        /// step 0 finds the name "John Smith", step 1 finds the card number
        /// that follows it, step 2 finds an AMEX-formatted number, and step 3
        /// searches upwards for the owner's name, overlays link annotations on
        /// it and saves the document as "credit card numbers_linked.pdf".
        /// A <see cref="PDFNetException"/> is reported on the console and
        /// fails the test.
        /// </summary>
        [Test]
        public static void Sample()
        {
            // Relative path to the folder containing test files.
            const string input_path = "TestFiles/";

            // Sample code showing how to use high-level text extraction APIs.
            try
            {
                using (PDFDoc doc = new PDFDoc(Utils.GetAssetTempFile(input_path + "credit card numbers.pdf")))
                {
                    doc.InitSecurityHandler();

                    Int32 page_num = 0;
                    String result_str = "", ambient_string = "";
                    Highlights hlts = new Highlights();

                    TextSearch txt_search = new TextSearch();

                    // 'e_highlight' is deliberately NOT part of the initial mode: the first
                    // match only needs the matched text. It is switched on in step 0, right
                    // before the first search whose highlight data is actually consumed.
                    Int32 mode = (Int32)(TextSearch.SearchMode.e_whole_word | TextSearch.SearchMode.e_page_stop);
                    String pattern = "joHn sMiTh";

                    // Call Begin() to initialize the text search.
                    txt_search.Begin(doc, pattern, mode, -1, -1);

                    // Index of the search stage we are waiting on; advanced after each match.
                    int step = 0;

                    // Call Run() iteratively to find all matching instances.
                    while (true)
                    {
                        TextSearch.ResultCode code = txt_search.Run(ref page_num, ref result_str, ref ambient_string, hlts);

                        if (code == TextSearch.ResultCode.e_found)
                        {
                            if (step == 0)
                            {
                                // Step 0: found "John Smith".
                                // Note that, here, 'ambient_string' and 'hlts' are not written to,
                                // as 'e_ambient_string' and 'e_highlight' are not set.
                                Console.WriteLine(result_str + "'s credit card number is: ");

                                // Now switch to using regular expressions to find John's credit
                                // card number, and request highlight info for the match.
                                mode = txt_search.GetMode();
                                mode |= (Int32)(TextSearch.SearchMode.e_reg_expression | TextSearch.SearchMode.e_highlight);
                                txt_search.SetMode(mode);
                                pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; // or "(\\d{4}-){3}\\d{4}"
                                txt_search.SetPattern(pattern);

                                ++step;
                            }
                            else if (step == 1)
                            {
                                // Step 1: found John's credit card number.
                                Console.WriteLine(" " + result_str);

                                // Note that, here, 'hlts' is written to, as 'e_highlight' has been set.
                                // Output the highlight info of the credit card number.
                                hlts.Begin(doc);
                                while (hlts.HasNext())
                                {
                                    Console.WriteLine("The current highlight is from page: " + hlts.GetCurrentPageNumber());
                                    hlts.Next();
                                }

                                // See if there is an AMEX card number.
                                pattern = "\\d{4}-\\d{6}-\\d{5}";
                                txt_search.SetPattern(pattern);

                                ++step;
                            }
                            else if (step == 2)
                            {
                                // Step 2: found an AMEX card number.
                                Console.WriteLine("\nThere is an AMEX card number:\n " + result_str);

                                // Change mode to find the owner of the credit card; supposedly,
                                // the owner's name precedes the number, so search upwards.
                                mode = txt_search.GetMode();
                                mode |= (Int32)(TextSearch.SearchMode.e_search_up);
                                txt_search.SetMode(mode);

                                // Two words made of ASCII letters. The range is spelled out as
                                // [A-Za-z] because [A-z] would also match the punctuation
                                // characters that sit between 'Z' and 'a' in ASCII ([ \ ] ^ _ `).
                                pattern = "[A-Za-z]++ [A-Za-z]++";
                                txt_search.SetPattern(pattern);

                                ++step;
                            }
                            else if (step == 3)
                            {
                                // Step 3: found the owner's name of the AMEX card.
                                Console.WriteLine("Is the owner's name:\n " + result_str + "?");

                                // Add a link annotation based on the location of the found instance.
                                AddLinksOverHighlights(doc, hlts);
                                doc.Save(Utils.CreateExternalFile("credit card numbers_linked.pdf"), SDFDoc.SaveOptions.e_linearized);

                                // All stages are done; leave the search loop.
                                break;
                            }
                        }
                        else if (code == TextSearch.ResultCode.e_page)
                        {
                            // You can update your UI here, if needed.
                        }
                        else
                        {
                            // Any other result code ends the search.
                            break;
                        }
                    }
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
                // Fail with the actual reason instead of an anonymous "expected True".
                Assert.Fail(e.Message);
            }
        }

        /// <summary>
        /// Adds one URI link annotation to <paramref name="doc"/> for every quad
        /// currently held by <paramref name="hlts"/>.
        /// </summary>
        /// <param name="doc">Document the highlights belong to; receives the annotations.</param>
        /// <param name="hlts">Highlights filled in by the preceding search.</param>
        private static void AddLinksOverHighlights(PDFDoc doc, Highlights hlts)
        {
            hlts.Begin(doc);
            while (hlts.HasNext())
            {
                Page cur_page = doc.GetPage(hlts.GetCurrentPageNumber());

                // Flat array of coordinates, consumed here as 8 doubles
                // (4 x/y corner pairs) per quad.
                double[] quads = hlts.GetCurrentQuads();
                int quad_count = quads.Length / 8;
                for (int i = 0; i < quad_count; ++i)
                {
                    // Assume each quad is an axis-aligned rectangle, so its bounding
                    // box (min/max over the four corners) is the quad itself.
                    int offset = 8 * i;
                    double x1 = Math.Min(Math.Min(Math.Min(quads[offset + 0], quads[offset + 2]), quads[offset + 4]), quads[offset + 6]);
                    double x2 = Math.Max(Math.Max(Math.Max(quads[offset + 0], quads[offset + 2]), quads[offset + 4]), quads[offset + 6]);
                    double y1 = Math.Min(Math.Min(Math.Min(quads[offset + 1], quads[offset + 3]), quads[offset + 5]), quads[offset + 7]);
                    double y2 = Math.Max(Math.Max(Math.Max(quads[offset + 1], quads[offset + 3]), quads[offset + 5]), quads[offset + 7]);

                    pdftron.PDF.Annots.Link hyper_link = pdftron.PDF.Annots.Link.Create(doc, new Rect(x1, y1, x2, y2), pdftron.PDF.Action.CreateURI(doc, "http://www.pdftron.com"));
                    hyper_link.RefreshAppearance();
                    cur_page.AnnotPushBack(hyper_link);
                }
                hlts.Next();
            }
        }
    }
}
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales