Search PDF for Text / String - TextSearch - Java Sample Code

Sample code for using the Apryse SDK to search text on PDF pages using regular expressions; provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. The TextSearch utility class builds on functionality available in the TextExtractor sample to simplify the most common search operations. Learn more about our Server SDK and PDF Indexed Search Library.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import com.pdftron.common.PDFNetException;
7import com.pdftron.pdf.*;
8import com.pdftron.sdf.SDFDoc;
9
10// This sample illustrates the basic text search capabilities of PDFNet.
11public class TextSearchTest {
12
13 public static void main(String[] args) {
14 PDFNet.initialize(PDFTronLicense.Key());
15 String input_path = "../../TestFiles/";
16
17 try (PDFDoc doc = new PDFDoc(input_path + "credit card numbers.pdf")) {
18 doc.initSecurityHandler();
19
20 TextSearch txt_search = new TextSearch();
21 int mode = TextSearch.e_whole_word | TextSearch.e_page_stop;
22
23 String pattern = "joHn sMiTh";
24
25 //PDFDoc doesn't allow simultaneous access from different threads. If this
26 //document could be used from other threads (e.g., the rendering thread inside
27 //PDFView/PDFViewCtrl, if used), it is good practice to lock it.
28 //Notice: don't forget to call doc.Unlock() to avoid deadlock.
29 doc.lock();
30
31 //call Begin() method to initialize the text search.
32 txt_search.begin(doc, pattern, mode, -1, -1);
33
34 int step = 0;
35
36 //call Run() method iteratively to find all matching instances.
37 while (true) {
38 TextSearchResult result = txt_search.run();
39
40 if (result.getCode() == TextSearchResult.e_found) {
41 if (step == 0) {
42 //step 0: found "John Smith"
43 //note that, here, 'ambient_string' and 'hlts' are not written to,
44 //as 'e_ambient_string' and 'e_highlight' are not set.
45 System.out.println(result.getResultStr() + "'s credit card number is: ");
46
47 //now switch to using regular expressions to find John's credit card number
48 mode = txt_search.getMode();
49 mode |= TextSearch.e_reg_expression | TextSearch.e_highlight;
50 txt_search.setMode(mode);
51 String new_pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
52 txt_search.setPattern(new_pattern);
53
54 step = step + 1;
55 } else if (step == 1) {
56 //step 1: found John's credit card number
57 System.out.println(" " + result.getResultStr());
58
59 //note that, here, 'hlts' is written to, as 'e_highlight' has been set.
60 //output the highlight info of the credit card number
61 Highlights hlts = result.getHighlights();
62 hlts.begin(doc);
63 while (hlts.hasNext()) {
64 System.out.println("The current highlight is from page: " + hlts.getCurrentPageNumber());
65 hlts.next();
66 }
67
68 //see if there is an AMEX card number
69 String new_pattern = "\\d{4}-\\d{6}-\\d{5}";
70 txt_search.setPattern(new_pattern);
71
72 step = step + 1;
73 } else if (step == 2) {
74 //found an AMEX card number
75 System.out.println("\nThere is an AMEX card number:");
76 System.out.println(" " + result.getResultStr());
77
78 //change mode to find the owner of the credit card; supposedly, the owner's
79 //name proceeds the number
80 mode = txt_search.getMode();
81 mode |= TextSearch.e_search_up;
82 txt_search.setMode(mode);
83 String new_pattern = "[A-z]++ [A-z]++";
84 txt_search.setPattern(new_pattern);
85
86 step = step + 1;
87 } else if (step == 3) {
88 //found the owner's name of the AMEX card
89 System.out.println("Is the owner's name:");
90 System.out.println(" " + result.getResultStr() + "?");
91
92 //add a link annotation based on the location of the found instance
93 Highlights hlts = result.getHighlights();
94 hlts.begin(doc);
95 while (hlts.hasNext()) {
96 Page cur_page = doc.getPage(hlts.getCurrentPageNumber());
97 double[] q = hlts.getCurrentQuads();
98 int quad_count = q.length / 8;
99 for (int i = 0; i < quad_count; ++i) {
100 //assume each quad is an axis-aligned rectangle
101 int offset = 8 * i;
102 double x1 = Math.min(Math.min(Math.min(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6]);
103 double x2 = Math.max(Math.max(Math.max(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6]);
104 double y1 = Math.min(Math.min(Math.min(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7]);
105 double y2 = Math.max(Math.max(Math.max(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7]);
106 com.pdftron.pdf.annots.Link hyper_link = com.pdftron.pdf.annots.Link.create(doc, new Rect(x1, y1, x2, y2), Action.createURI(doc, "http://www.pdftron.com"));
107 cur_page.annotPushBack(hyper_link);
108 }
109 hlts.next();
110 }
111 String output_path = "../../TestFiles/Output/";
112 doc.save(output_path + "credit card numbers_linked.pdf", SDFDoc.SaveMode.LINEARIZED, null);
113 // output PDF doc
114 break;
115 }
116 } else if (result.getCode() == TextSearchResult.e_page) {
117 //you can update your UI here, if needed
118 } else {
119 break;
120 }
121 }
122
123 doc.unlock();
124 } catch (PDFNetException e) {
125 System.out.println(e);
126 }
127
128 PDFNet.terminate();
129 }
130}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales