TextSearch

Sample Java and Kotlin code for using the Apryse SDK to search text on PDF pages using regular expressions. The TextSearch utility class builds on functionality available in TextExtractor to simplify the most common search operations. Learn more about our Android SDK and PDF Indexed Search Library.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package com.pdftron.android.pdfnetsdksamples.samples;
7
8import com.pdftron.android.pdfnetsdksamples.OutputListener;
9import com.pdftron.android.pdfnetsdksamples.PDFNetSample;
10import com.pdftron.android.pdfnetsdksamples.R;
11import com.pdftron.android.pdfnetsdksamples.util.Utils;
12import com.pdftron.common.PDFNetException;
13import com.pdftron.pdf.Action;
14import com.pdftron.pdf.Highlights;
15import com.pdftron.pdf.PDFDoc;
16import com.pdftron.pdf.Page;
17import com.pdftron.pdf.Rect;
18import com.pdftron.pdf.TextSearch;
19import com.pdftron.pdf.TextSearchResult;
20import com.pdftron.sdf.SDFDoc;
21
22import java.util.ArrayList;
23
24public class TextSearchTest extends PDFNetSample {
25
26	private static OutputListener mOutputListener;
27
28	private static ArrayList<String> mFileList = new ArrayList<>();
29
30    public TextSearchTest() {
31        setTitle(R.string.sample_textsearch_title);
32        setDescription(R.string.sample_textsearch_description);
33    }
34
35	@Override
36	public void run(OutputListener outputListener) {
37		super.run(outputListener);
38		mOutputListener = outputListener;
39		mFileList.clear();
40		printHeader(outputListener);
41
42        try (PDFDoc doc = new PDFDoc(Utils.getAssetTempFile(INPUT_PATH + "credit card numbers.pdf").getAbsolutePath())) {
43            doc.initSecurityHandler();
44
45            TextSearch txt_search = new TextSearch();
46            int mode = TextSearch.e_whole_word | TextSearch.e_page_stop;
47
48            String pattern = "joHn sMiTh";
49
50            //PDFDoc doesn't allow simultaneous access from different threads. If this
51            //document could be used from other threads (e.g., the rendering thread inside
52            //PDFView/PDFViewCtrl, if used), it is good practice to lock it.
53            //Notice: don't forget to call doc.Unlock() to avoid deadlock.
54            doc.lock();
55
56            //call Begin() method to initialize the text search.
57            txt_search.begin(doc, pattern, mode, -1, -1);
58
59            int step = 0;
60
61            //call Run() method iteratively to find all matching instances.
62            while (true) {
63                TextSearchResult result = txt_search.run();
64
65                if (result.getCode() == TextSearchResult.e_found) {
66                    if (step == 0) {
67                        //step 0: found "John Smith"
68                        //note that, here, 'ambient_string' and 'hlts' are not written to,
69                        //as 'e_ambient_string' and 'e_highlight' are not set.
70                        mOutputListener.println(result.getResultStr() + "'s credit card number is: ");
71
72                        //now switch to using regular expressions to find John's credit card number
73                        mode = txt_search.getMode();
74                        mode |= TextSearch.e_reg_expression | TextSearch.e_highlight;
75                        txt_search.setMode(mode);
76                        String new_pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
77                        txt_search.setPattern(new_pattern);
78
79                        step = step + 1;
80                    } else if (step == 1) {
81                        //step 1: found John's credit card number
82                        mOutputListener.println("  " + result.getResultStr());
83
84                        //note that, here, 'hlts' is written to, as 'e_highlight' has been set.
85                        //output the highlight info of the credit card number
86                        Highlights hlts = result.getHighlights();
87                        hlts.begin(doc);
88                        while (hlts.hasNext()) {
89                            mOutputListener.println("The current highlight is from page: " + hlts.getCurrentPageNumber());
90                            hlts.next();
91                        }
92
93                        //see if there is an AMEX card number
94                        String new_pattern = "\\d{4}-\\d{6}-\\d{5}";
95                        txt_search.setPattern(new_pattern);
96
97                        step = step + 1;
98                    } else if (step == 2) {
99                        //found an AMEX card number
100                        mOutputListener.println("\nThere is an AMEX card number:");
101                        mOutputListener.println("  " + result.getResultStr());
102
103                        //change mode to find the owner of the credit card; supposedly, the owner's
104                        //name proceeds the number
105                        mode = txt_search.getMode();
106                        mode |= TextSearch.e_search_up;
107                        txt_search.setMode(mode);
108                        String new_pattern = "[A-z]++ [A-z]++";
109                        txt_search.setPattern(new_pattern);
110
111                        step = step + 1;
112                    } else if (step == 3) {
113                        //found the owner's name of the AMEX card
114                        mOutputListener.println("Is the owner's name:");
115                        mOutputListener.println("  " + result.getResultStr() + "?");
116
117                        //add a link annotation based on the location of the found instance
118                        Highlights hlts = result.getHighlights();
119                        hlts.begin(doc);
120                        while (hlts.hasNext()) {
121                            Page cur_page = doc.getPage(hlts.getCurrentPageNumber());
122                            double[] q = hlts.getCurrentQuads();
123                            int quad_count = q.length / 8;
124                            for (int i = 0; i < quad_count; ++i) {
125                                //assume each quad is an axis-aligned rectangle
126                                int offset = 8 * i;
127                                double x1 = Math.min(Math.min(Math.min(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6]);
128                                double x2 = Math.max(Math.max(Math.max(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6]);
129                                double y1 = Math.min(Math.min(Math.min(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7]);
130                                double y2 = Math.max(Math.max(Math.max(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7]);
131                                com.pdftron.pdf.annots.Link hyper_link = com.pdftron.pdf.annots.Link.create(doc, new Rect(x1, y1, x2, y2), Action.createURI(doc, "http://www.pdftron.com"));
132                                cur_page.annotPushBack(hyper_link);
133                            }
134                            hlts.next();
135                        }
136                        doc.save(Utils.createExternalFile("credit card numbers_linked.pdf", mFileList).getAbsolutePath(), SDFDoc.SaveMode.LINEARIZED, null);
137                        break;
138                    }
139                } else if (result.getCode() == TextSearchResult.e_page) {
140                    //you can update your UI here, if needed
141                } else {
142                    break;
143                }
144            }
145
146            doc.unlock();
147        } catch (PDFNetException e) {
148            mOutputListener.printError(e.getStackTrace());
149        }
150
151		for (String file : mFileList) {
152			addToFileList(file);
153		}
154		printFooter(outputListener);
155	}
156
157}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package com.pdftron.android.pdfnetsdksamples.samples
7
8import com.pdftron.android.pdfnetsdksamples.OutputListener
9import com.pdftron.android.pdfnetsdksamples.PDFNetSample
10import com.pdftron.android.pdfnetsdksamples.R
11import com.pdftron.android.pdfnetsdksamples.util.Utils
12import com.pdftron.common.PDFNetException
13import com.pdftron.pdf.*
14import com.pdftron.sdf.SDFDoc
15import java.util.*
16
17class TextSearchTest : PDFNetSample() {
18    init {
19        setTitle(R.string.sample_textsearch_title)
20        setDescription(R.string.sample_textsearch_description)
21    }
22
23    override fun run(outputListener: OutputListener?) {
24        super.run(outputListener)
25        mOutputListener = outputListener
26        mFileList.clear()
27        printHeader(outputListener!!)
28
29        try {
30            PDFDoc(Utils.getAssetTempFile(PDFNetSample.INPUT_PATH + "credit card numbers.pdf")!!.absolutePath).use { doc ->
31                doc.initSecurityHandler()
32
33                val txt_search = TextSearch()
34                var mode = TextSearch.e_whole_word or TextSearch.e_page_stop
35
36                val pattern = "joHn sMiTh"
37
38                //PDFDoc doesn't allow simultaneous access from different threads. If this
39                //document could be used from other threads (e.g., the rendering thread inside
40                //PDFView/PDFViewCtrl, if used), it is good practice to lock it.
41                //Notice: don't forget to call doc.Unlock() to avoid deadlock.
42                doc.lock()
43
44                //call Begin() method to initialize the text search.
45                txt_search.begin(doc, pattern, mode, -1, -1)
46
47                var step = 0
48
49                //call Run() method iteratively to find all matching instances.
50                while (true) {
51                    val result = txt_search.run()
52
53                    if (result.code == TextSearchResult.e_found) {
54                        if (step == 0) {
55                            //step 0: found "John Smith"
56                            //note that, here, 'ambient_string' and 'hlts' are not written to,
57                            //as 'e_ambient_string' and 'e_highlight' are not set.
58                            mOutputListener!!.println(result.resultStr + "'s credit card number is: ")
59
60                            //now switch to using regular expressions to find John's credit card number
61                            mode = txt_search.mode
62                            mode = mode or (TextSearch.e_reg_expression or TextSearch.e_highlight)
63                            txt_search.mode = mode
64                            val new_pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}" //or "(\\d{4}-){3}\\d{4}"
65                            txt_search.setPattern(new_pattern)
66
67                            step = step + 1
68                        } else if (step == 1) {
69                            //step 1: found John's credit card number
70                            mOutputListener!!.println("  " + result.resultStr)
71
72                            //note that, here, 'hlts' is written to, as 'e_highlight' has been set.
73                            //output the highlight info of the credit card number
74                            val hlts = result.highlights
75                            hlts.begin(doc)
76                            while (hlts.hasNext()) {
77                                mOutputListener!!.println("The current highlight is from page: " + hlts.currentPageNumber)
78                                hlts.next()
79                            }
80
81                            //see if there is an AMEX card number
82                            val new_pattern = "\\d{4}-\\d{6}-\\d{5}"
83                            txt_search.setPattern(new_pattern)
84
85                            step = step + 1
86                        } else if (step == 2) {
87                            //found an AMEX card number
88                            mOutputListener!!.println("\nThere is an AMEX card number:")
89                            mOutputListener!!.println("  " + result.resultStr)
90
91                            //change mode to find the owner of the credit card; supposedly, the owner's
92                            //name proceeds the number
93                            mode = txt_search.mode
94                            mode = mode or TextSearch.e_search_up
95                            txt_search.mode = mode
96                            val new_pattern = "[A-z]++ [A-z]++"
97                            txt_search.setPattern(new_pattern)
98
99                            step = step + 1
100                        } else if (step == 3) {
101                            //found the owner's name of the AMEX card
102                            mOutputListener!!.println("Is the owner's name:")
103                            mOutputListener!!.println("  " + result.resultStr + "?")
104
105                            //add a link annotation based on the location of the found instance
106                            val hlts = result.highlights
107                            hlts.begin(doc)
108                            while (hlts.hasNext()) {
109                                val cur_page = doc.getPage(hlts.currentPageNumber)
110                                val q = hlts.currentQuads
111                                val quad_count = q.size / 8
112                                for (i in 0 until quad_count) {
113                                    //assume each quad is an axis-aligned rectangle
114                                    val offset = 8 * i
115                                    val x1 = Math.min(Math.min(Math.min(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6])
116                                    val x2 = Math.max(Math.max(Math.max(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6])
117                                    val y1 = Math.min(Math.min(Math.min(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7])
118                                    val y2 = Math.max(Math.max(Math.max(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7])
119                                    val hyper_link = com.pdftron.pdf.annots.Link.create(doc, Rect(x1, y1, x2, y2), Action.createURI(doc, "http://www.pdftron.com"))
120                                    cur_page.annotPushBack(hyper_link)
121                                }
122                                hlts.next()
123                            }
124                            doc.save(Utils.createExternalFile("credit card numbers_linked.pdf", mFileList).absolutePath, SDFDoc.SaveMode.LINEARIZED, null)
125                            break
126                        }
127                    } else if (result.code == TextSearchResult.e_page) {
128                        //you can update your UI here, if needed
129                    } else {
130                        break
131                    }
132                }
133
134                doc.unlock()
135            }
136        } catch (e: PDFNetException) {
137            mOutputListener!!.printError(e.stackTrace)
138        }
139
140        for (file in mFileList) {
141            addToFileList(file)
142        }
143        printFooter(outputListener)
144    }
145
146    companion object {
147
148        private var mOutputListener: OutputListener? = null
149
150        private val mFileList = ArrayList<String>()
151    }
152
153}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales

Product:

TextSearch