Sample C# code for using the Apryse SDK to search text on PDF pages using regular expressions. The TextSearch utility class builds on functionality available in TextExtractor to simplify the most common search operations. Learn more about our UWP SDK and PDF Indexed Search Library.
1//
2// Copyright (c) 2001-2020 by PDFTron Systems Inc. All Rights Reserved.
3//
4
5using System;
6using System.IO;
7using System.Threading.Tasks;
8using Windows.Foundation;
9
10using pdftron.Common;
11using pdftron.PDF;
12using pdftron.SDF;
13
14using PDFNetUniversalSamples.ViewModels;
15
namespace PDFNetSamples
{
    /// <summary>
    /// Sample that drives <c>pdftron.PDF.TextSearch</c> through a small state machine:
    /// it first looks for the literal text "John Smith", then switches to regular
    /// expressions to find a credit card number, an AMEX-style number, and finally
    /// (searching upwards) a two-word name, over which link annotations are added
    /// before the document is saved.
    /// </summary>
    public sealed class TextSearchTest : Sample
    {
        /// <summary>
        /// Registers the sample under the name "TextSearch" together with its description.
        /// </summary>
        public TextSearchTest() :
            base("TextSearch", "This sample shows how to use pdftron.PDF.TextSearch to search text on PDF pages using regular expressions. TextSearch utility class builds on functionality available in TextExtractor to simplify most common search operations.")
        {
        }

        /// <summary>
        /// Runs the text search sample on a background task.
        /// </summary>
        /// <returns>
        /// An <see cref="IAsyncAction"/> that completes once the whole sample, including
        /// the awaited save of the output document, has finished. Exceptions raised while
        /// running are caught and written to the sample output.
        /// </returns>
        public override IAsyncAction RunAsync()
        {
            // The lambda is passed as Func<Task> (not wrapped in System.Action): an
            // 'async void' delegate would let the task complete at the first await,
            // i.e. before the document has been saved and the final banner written.
            return Task.Run(async () =>
            {
                WriteLine("--------------------------------");
                WriteLine("Starting TextSearch Test...");
                WriteLine("--------------------------------\n");
                try
                {
                    string input_file_path = Path.Combine(InputPath, "credit card numbers.pdf");
                    WriteLine("Opening input file " + input_file_path);
                    PDFDoc doc = new PDFDoc(input_file_path);
                    doc.InitSecurityHandler();

                    // Output holders that TextSearch.Run() fills in on every call.
                    pdftron.Common.Int32Ref page_num = new pdftron.Common.Int32Ref(0);
                    pdftron.Common.StringRef result_str = new pdftron.Common.StringRef();
                    pdftron.Common.StringRef ambient_string = new pdftron.Common.StringRef();
                    Highlights hlts = new Highlights();

                    TextSearch txt_search = new TextSearch();
                    int mode = (int)(TextSearchSearchMode.e_whole_word) | (int)(TextSearchSearchMode.e_page_stop) | (int)(TextSearchSearchMode.e_highlight);
                    String pattern = "John Smith";

                    // Call Begin() to initialize the text search.
                    txt_search.Begin(doc, pattern, mode, -1, -1);

                    // Index of the search stage currently in progress (0..3).
                    int step = 0;

                    // Call Run() iteratively to find all matching instances.
                    while (true)
                    {
                        TextSearchResultCode code = txt_search.Run(page_num, result_str, ambient_string, hlts);

                        if (code == TextSearchResultCode.e_found)
                        {
                            if (step == 0)
                            {
                                // Step 0: found "John Smith".
                                // 'e_ambient_string' is not part of the mode, so per the sample's
                                // notes 'ambient_string' is not written to here; 'e_highlight',
                                // however, is already included in the initial mode above.
                                WriteLine(result_str.Value + "'s credit card number is: ");

                                // Now switch to regular expressions to find John's credit card number.
                                mode = txt_search.GetMode();
                                mode |= (int)(TextSearchSearchMode.e_reg_expression | TextSearchSearchMode.e_highlight);
                                txt_search.SetMode(mode);
                                pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
                                txt_search.SetPattern(pattern);

                                ++step;
                            }
                            else if (step == 1)
                            {
                                // Step 1: found John's credit card number.
                                WriteLine(result_str.Value);

                                // 'e_highlight' is set, so output the highlight info
                                // of the credit card number.
                                WriteHighlightPages(doc, hlts);

                                // See if there is an AMEX card number.
                                pattern = "\\d{4}-\\d{6}-\\d{5}";
                                txt_search.SetPattern(pattern);

                                ++step;
                            }
                            else if (step == 2)
                            {
                                // Step 2: found an AMEX card number.
                                WriteLine("There is an AMEX card number:\n  " + result_str.Value);

                                // Change mode to find the owner of the credit card; supposedly,
                                // the owner's name precedes the number, hence the upward search.
                                mode = txt_search.GetMode();
                                mode |= (int)(TextSearchSearchMode.e_search_up);
                                txt_search.SetMode(mode);
                                // Two words of ASCII letters. "[A-Za-z]" is used instead of "[A-z]"
                                // because the latter also matches the punctuation characters that
                                // sit between 'Z' and 'a' in ASCII ('[', '\', ']', '^', '_', '`').
                                pattern = "[A-Za-z]+ [A-Za-z]+";
                                txt_search.SetPattern(pattern);

                                ++step;
                            }
                            else if (step == 3)
                            {
                                // Step 3: found the (presumed) owner's name of the AMEX card.
                                WriteLine("Is the owner's name:\n  " + result_str.Value + "?");

                                // Add a link annotation based on the location of the found instance.
                                AddLinksOverHighlights(doc, hlts);

                                string output_file_path = Path.Combine(OutputPath, "credit card numbers_linked.pdf");
                                await doc.SaveAsync(output_file_path, SDFDocSaveOptions.e_linearized);
                                WriteLine("Done. Results saved in " + output_file_path);
                                await AddFileToOutputList(output_file_path).ConfigureAwait(false);

                                break;
                            }
                        }
                        else if (code == TextSearchResultCode.e_page)
                        {
                            // You can update your UI here, if needed.
                        }
                        else
                        {
                            // Any other result code ends the search loop.
                            break;
                        }
                    }
                }
                catch (Exception e)
                {
                    WriteLine(GetExceptionMessage(e));
                }

                WriteLine("\n--------------------------------");
                WriteLine("Done TextSearch Test.");
                WriteLine("--------------------------------\n");
            }).AsAsyncAction();
        }

        /// <summary>
        /// Writes one output line per highlight in <paramref name="hlts"/>, reporting
        /// the page number the highlight is on.
        /// </summary>
        /// <param name="doc">Document the highlights belong to.</param>
        /// <param name="hlts">Highlights produced by the last successful search.</param>
        private void WriteHighlightPages(PDFDoc doc, Highlights hlts)
        {
            hlts.Begin(doc);
            while (hlts.HasNext())
            {
                WriteLine("The current highlight is from page: " + hlts.GetCurrentPageNumber());
                hlts.Next();
            }
        }

        /// <summary>
        /// Adds a URI link annotation over the bounding box of every quad of every
        /// highlight in <paramref name="hlts"/>.
        /// </summary>
        /// <param name="doc">Document that receives the link annotations.</param>
        /// <param name="hlts">Highlights produced by the last successful search.</param>
        private static void AddLinksOverHighlights(PDFDoc doc, Highlights hlts)
        {
            hlts.Begin(doc);
            while (hlts.HasNext())
            {
                pdftron.PDF.Page cur_page = doc.GetPage(hlts.GetCurrentPageNumber());
                double[] quads = hlts.GetCurrentQuads();

                // Each quad is stored as 8 consecutive doubles: four (x, y) corner points.
                int quad_count = quads.Length / 8;
                for (int i = 0; i < quad_count; ++i)
                {
                    // Assume each quad is an axis-aligned rectangle and take the
                    // min/max of its corner coordinates as the link rectangle.
                    int offset = 8 * i;
                    double x1 = Math.Min(Math.Min(Math.Min(quads[offset + 0], quads[offset + 2]), quads[offset + 4]), quads[offset + 6]);
                    double x2 = Math.Max(Math.Max(Math.Max(quads[offset + 0], quads[offset + 2]), quads[offset + 4]), quads[offset + 6]);
                    double y1 = Math.Min(Math.Min(Math.Min(quads[offset + 1], quads[offset + 3]), quads[offset + 5]), quads[offset + 7]);
                    double y2 = Math.Max(Math.Max(Math.Max(quads[offset + 1], quads[offset + 3]), quads[offset + 5]), quads[offset + 7]);

                    pdftron.PDF.Annots.Link hyper_link = pdftron.PDF.Annots.Link.Create(doc.GetSDFDoc(), new pdftron.PDF.Rect(x1, y1, x2, y2), pdftron.PDF.Action.CreateURI(doc.GetSDFDoc(), "http://www.pdftron.com"));
                    hyper_link.RefreshAppearance();
                    cur_page.AnnotPushBack(hyper_link);
                }
                hlts.Next();
            }
        }
    }
}
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales