Sample Java code for using Apryse SDK to search text on PDF pages using regular expressions. The TextSearch utility class builds on functionality available in TextExtractor to simplify most common search operations. Learn more about our Android SDK and PDF Indexed Search Library.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package com.pdftron.android.pdfnetsdksamples.samples;
7
8import com.pdftron.android.pdfnetsdksamples.OutputListener;
9import com.pdftron.android.pdfnetsdksamples.PDFNetSample;
10import com.pdftron.android.pdfnetsdksamples.R;
11import com.pdftron.android.pdfnetsdksamples.util.Utils;
12import com.pdftron.common.PDFNetException;
13import com.pdftron.pdf.Action;
14import com.pdftron.pdf.Highlights;
15import com.pdftron.pdf.PDFDoc;
16import com.pdftron.pdf.Page;
17import com.pdftron.pdf.Rect;
18import com.pdftron.pdf.TextSearch;
19import com.pdftron.pdf.TextSearchResult;
20import com.pdftron.sdf.SDFDoc;
21
22import java.util.ArrayList;
23
24public class TextSearchTest extends PDFNetSample {
25
26 private static OutputListener mOutputListener;
27
28 private static ArrayList<String> mFileList = new ArrayList<>();
29
30 public TextSearchTest() {
31 setTitle(R.string.sample_textsearch_title);
32 setDescription(R.string.sample_textsearch_description);
33 }
34
35 @Override
36 public void run(OutputListener outputListener) {
37 super.run(outputListener);
38 mOutputListener = outputListener;
39 mFileList.clear();
40 printHeader(outputListener);
41
42 try (PDFDoc doc = new PDFDoc(Utils.getAssetTempFile(INPUT_PATH + "credit card numbers.pdf").getAbsolutePath())) {
43 doc.initSecurityHandler();
44
45 TextSearch txt_search = new TextSearch();
46 int mode = TextSearch.e_whole_word | TextSearch.e_page_stop;
47
48 String pattern = "joHn sMiTh";
49
50 //PDFDoc doesn't allow simultaneous access from different threads. If this
51 //document could be used from other threads (e.g., the rendering thread inside
52 //PDFView/PDFViewCtrl, if used), it is good practice to lock it.
53 //Notice: don't forget to call doc.Unlock() to avoid deadlock.
54 doc.lock();
55
56 //call Begin() method to initialize the text search.
57 txt_search.begin(doc, pattern, mode, -1, -1);
58
59 int step = 0;
60
61 //call Run() method iteratively to find all matching instances.
62 while (true) {
63 TextSearchResult result = txt_search.run();
64
65 if (result.getCode() == TextSearchResult.e_found) {
66 if (step == 0) {
67 //step 0: found "John Smith"
68 //note that, here, 'ambient_string' and 'hlts' are not written to,
69 //as 'e_ambient_string' and 'e_highlight' are not set.
70 mOutputListener.println(result.getResultStr() + "'s credit card number is: ");
71
72 //now switch to using regular expressions to find John's credit card number
73 mode = txt_search.getMode();
74 mode |= TextSearch.e_reg_expression | TextSearch.e_highlight;
75 txt_search.setMode(mode);
76 String new_pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
77 txt_search.setPattern(new_pattern);
78
79 step = step + 1;
80 } else if (step == 1) {
81 //step 1: found John's credit card number
82 mOutputListener.println(" " + result.getResultStr());
83
84 //note that, here, 'hlts' is written to, as 'e_highlight' has been set.
85 //output the highlight info of the credit card number
86 Highlights hlts = result.getHighlights();
87 hlts.begin(doc);
88 while (hlts.hasNext()) {
89 mOutputListener.println("The current highlight is from page: " + hlts.getCurrentPageNumber());
90 hlts.next();
91 }
92
93 //see if there is an AMEX card number
94 String new_pattern = "\\d{4}-\\d{6}-\\d{5}";
95 txt_search.setPattern(new_pattern);
96
97 step = step + 1;
98 } else if (step == 2) {
99 //found an AMEX card number
100 mOutputListener.println("\nThere is an AMEX card number:");
101 mOutputListener.println(" " + result.getResultStr());
102
103 //change mode to find the owner of the credit card; supposedly, the owner's
104 //name proceeds the number
105 mode = txt_search.getMode();
106 mode |= TextSearch.e_search_up;
107 txt_search.setMode(mode);
108 String new_pattern = "[A-z]++ [A-z]++";
109 txt_search.setPattern(new_pattern);
110
111 step = step + 1;
112 } else if (step == 3) {
113 //found the owner's name of the AMEX card
114 mOutputListener.println("Is the owner's name:");
115 mOutputListener.println(" " + result.getResultStr() + "?");
116
117 //add a link annotation based on the location of the found instance
118 Highlights hlts = result.getHighlights();
119 hlts.begin(doc);
120 while (hlts.hasNext()) {
121 Page cur_page = doc.getPage(hlts.getCurrentPageNumber());
122 double[] q = hlts.getCurrentQuads();
123 int quad_count = q.length / 8;
124 for (int i = 0; i < quad_count; ++i) {
125 //assume each quad is an axis-aligned rectangle
126 int offset = 8 * i;
127 double x1 = Math.min(Math.min(Math.min(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6]);
128 double x2 = Math.max(Math.max(Math.max(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6]);
129 double y1 = Math.min(Math.min(Math.min(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7]);
130 double y2 = Math.max(Math.max(Math.max(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7]);
131 com.pdftron.pdf.annots.Link hyper_link = com.pdftron.pdf.annots.Link.create(doc, new Rect(x1, y1, x2, y2), Action.createURI(doc, "http://www.pdftron.com"));
132 cur_page.annotPushBack(hyper_link);
133 }
134 hlts.next();
135 }
136 doc.save(Utils.createExternalFile("credit card numbers_linked.pdf", mFileList).getAbsolutePath(), SDFDoc.SaveMode.LINEARIZED, null);
137 break;
138 }
139 } else if (result.getCode() == TextSearchResult.e_page) {
140 //you can update your UI here, if needed
141 } else {
142 break;
143 }
144 }
145
146 doc.unlock();
147 } catch (PDFNetException e) {
148 mOutputListener.printError(e.getStackTrace());
149 }
150
151 for (String file : mFileList) {
152 addToFileList(file);
153 }
154 printFooter(outputListener);
155 }
156
157}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package com.pdftron.android.pdfnetsdksamples.samples
7
8import com.pdftron.android.pdfnetsdksamples.OutputListener
9import com.pdftron.android.pdfnetsdksamples.PDFNetSample
10import com.pdftron.android.pdfnetsdksamples.R
11import com.pdftron.android.pdfnetsdksamples.util.Utils
12import com.pdftron.common.PDFNetException
13import com.pdftron.pdf.*
14import com.pdftron.sdf.SDFDoc
15import java.util.*
16
/**
 * Sample that walks through a PDF with [TextSearch] in four steps:
 *  1. find the literal text "joHn sMiTh" (whole-word search);
 *  2. switch to regular-expression mode and find a 4-4-4-4 digit card number;
 *  3. search for a 4-6-5 digit (AMEX-style) card number;
 *  4. search upwards for a two-word name, add a link annotation over each
 *     highlighted quad, and save the document.
 *
 * Progress and results are written to the [OutputListener] passed to [run].
 */
class TextSearchTest : PDFNetSample() {
    init {
        setTitle(R.string.sample_textsearch_title)
        setDescription(R.string.sample_textsearch_description)
    }

    /**
     * Runs the text search sample against "credit card numbers.pdf" and saves
     * the annotated result as "credit card numbers_linked.pdf".
     *
     * @param outputListener receives the sample's textual output and any error trace
     */
    override fun run(outputListener: OutputListener?) {
        super.run(outputListener)
        mOutputListener = outputListener
        mFileList.clear()
        printHeader(outputListener!!)

        try {
            PDFDoc(Utils.getAssetTempFile(PDFNetSample.INPUT_PATH + "credit card numbers.pdf")!!.absolutePath).use { doc ->
                doc.initSecurityHandler()

                val txt_search = TextSearch()
                var mode = TextSearch.e_whole_word or TextSearch.e_page_stop

                val pattern = "joHn sMiTh"

                //PDFDoc doesn't allow simultaneous access from different threads. If this
                //document could be used from other threads (e.g., the rendering thread inside
                //PDFView/PDFViewCtrl, if used), it is good practice to lock it.
                //The matching unlock() lives in the 'finally' below so that the lock is
                //released even when the search, annotation or save step throws.
                doc.lock()
                try {
                    //call begin() to initialize the text search.
                    txt_search.begin(doc, pattern, mode, -1, -1)

                    var step = 0

                    //call run() iteratively to find all matching instances.
                    while (true) {
                        val result = txt_search.run()

                        if (result.code == TextSearchResult.e_found) {
                            if (step == 0) {
                                //step 0: found "John Smith"
                                //note that, here, 'ambient_string' and 'hlts' are not written to,
                                //as 'e_ambient_string' and 'e_highlight' are not set.
                                mOutputListener!!.println(result.resultStr + "'s credit card number is: ")

                                //now switch to using regular expressions to find John's credit card number
                                mode = txt_search.mode
                                mode = mode or (TextSearch.e_reg_expression or TextSearch.e_highlight)
                                txt_search.mode = mode
                                val new_pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}" //or "(\\d{4}-){3}\\d{4}"
                                txt_search.setPattern(new_pattern)

                                step = step + 1
                            } else if (step == 1) {
                                //step 1: found John's credit card number
                                mOutputListener!!.println(" " + result.resultStr)

                                //note that, here, 'hlts' is written to, as 'e_highlight' has been set.
                                //output the highlight info of the credit card number
                                val hlts = result.highlights
                                hlts.begin(doc)
                                while (hlts.hasNext()) {
                                    mOutputListener!!.println("The current highlight is from page: " + hlts.currentPageNumber)
                                    hlts.next()
                                }

                                //see if there is an AMEX card number
                                val new_pattern = "\\d{4}-\\d{6}-\\d{5}"
                                txt_search.setPattern(new_pattern)

                                step = step + 1
                            } else if (step == 2) {
                                //found an AMEX card number
                                mOutputListener!!.println("\nThere is an AMEX card number:")
                                mOutputListener!!.println(" " + result.resultStr)

                                //change mode to find the owner of the credit card; supposedly, the owner's
                                //name precedes the number
                                mode = txt_search.mode
                                mode = mode or TextSearch.e_search_up
                                txt_search.mode = mode
                                //NOTE(review): in ASCII the range [A-z] also covers the punctuation
                                //between 'Z' and 'a' ([ \ ] ^ _ `); [A-Za-z] may be what was intended.
                                //Left unchanged so the sample's output stays the same - confirm.
                                val new_pattern = "[A-z]++ [A-z]++"
                                txt_search.setPattern(new_pattern)

                                step = step + 1
                            } else if (step == 3) {
                                //found the owner's name of the AMEX card
                                mOutputListener!!.println("Is the owner's name:")
                                mOutputListener!!.println(" " + result.resultStr + "?")

                                //add a link annotation based on the location of the found instance
                                addLinksOverHighlights(doc, result.highlights)
                                doc.save(Utils.createExternalFile("credit card numbers_linked.pdf", mFileList).absolutePath, SDFDoc.SaveMode.LINEARIZED, null)
                                break
                            }
                        } else if (result.code == TextSearchResult.e_page) {
                            //you can update your UI here, if needed
                        } else {
                            break
                        }
                    }
                } finally {
                    doc.unlock()
                }
            }
        } catch (e: PDFNetException) {
            mOutputListener!!.printError(e.stackTrace)
        }

        for (file in mFileList) {
            addToFileList(file)
        }
        printFooter(outputListener)
    }

    /**
     * Adds one URI link annotation per highlighted quad to the page the quad is on.
     * Each quad is reduced to its axis-aligned bounding box (eight doubles per quad,
     * read as four x/y pairs).
     *
     * @param doc  document that owns the pages and the new annotations
     * @param hlts highlights returned by the search result
     */
    private fun addLinksOverHighlights(doc: PDFDoc, hlts: Highlights) {
        hlts.begin(doc)
        while (hlts.hasNext()) {
            val cur_page = doc.getPage(hlts.currentPageNumber)
            val q = hlts.currentQuads
            val quad_count = q.size / 8
            for (i in 0 until quad_count) {
                //assume each quad is an axis-aligned rectangle
                val offset = 8 * i
                val x1 = Math.min(Math.min(Math.min(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6])
                val x2 = Math.max(Math.max(Math.max(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6])
                val y1 = Math.min(Math.min(Math.min(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7])
                val y2 = Math.max(Math.max(Math.max(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7])
                val hyper_link = com.pdftron.pdf.annots.Link.create(doc, Rect(x1, y1, x2, y2), Action.createURI(doc, "http://www.pdftron.com"))
                cur_page.annotPushBack(hyper_link)
            }
            hlts.next()
        }
    }

    companion object {

        private var mOutputListener: OutputListener? = null

        private val mFileList = ArrayList<String>()
    }

}
Did you find this helpful?
Trial setup questions? Ask experts on Discord.
Need other help? Contact Support.
Pricing or product questions? Contact Sales.