Sample JavaScript code for using the Apryse SDK to search text on PDF pages using regular expressions. The TextSearch utility class builds on functionality available in the Text Extractor sample code to simplify the most common search operations.
Learn more about our Web SDK and PDF Indexed Search Library.
This sample works with Full-API for WebViewer.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6(exports => {
7  exports.runTextSearchTest = () => {
8    const PDFNet = exports.Core.PDFNet;
9
10    const main = async () => {
11      // Relative path to the folder containing test files.
12      const inputURL = '../TestFiles/';
13      const inputFilename = 'credit card numbers.pdf'; // addimage.pdf, newsletter.pdf
14
15      try {
16        const doc = await PDFNet.PDFDoc.createFromURL(inputURL + inputFilename);
17        doc.initSecurityHandler();
18        doc.lock();
19
20        const txtSearch = await PDFNet.TextSearch.create();
21        let mode = PDFNet.TextSearch.Mode.e_whole_word + PDFNet.TextSearch.Mode.e_page_stop; // Uses both whole word and page stop
22        let pattern = 'joHn sMiTh';
23
24        txtSearch.begin(doc, pattern, mode); // searches for the "pattern" in the document while following the inputted modes.
25
26        let step = 0;
27
28        // call Run() iteratively to find all matching instances of the word 'joHn sMiTh'
29        /* eslint-disable-next-line no-constant-condition */
30        while (true) {
31          const result = await txtSearch.run();
32          let hlts;
33          if (result.code === PDFNet.TextSearch.ResultCode.e_found) {
34            if (step === 0) {
35              // Step 0: found "John Smith"
36              // note that, here, 'ambient_str' and 'highlights' are not written to,
37              // as 'e_ambient_string' and 'e_highlight' are not set.
38              console.log(result.out_str + "'s credit card number is: ");
39
40              // now switch to using regular expressions to find John's credit card number
41              mode = await txtSearch.getMode();
42              mode += PDFNet.TextSearch.Mode.e_reg_expression + PDFNet.TextSearch.Mode.e_highlight;
43              txtSearch.setMode(mode);
44              pattern = '\\d{4}-\\d{4}-\\d{4}-\\d{4}'; // or "(\\d{4}-){3}\\d{4}"
45              txtSearch.setPattern(pattern);
46
47              ++step;
48            } else if (step === 1) {
49              // step 1: found John's credit card number
50              console.log('  ' + result.out_str);
51              // note that, here, 'hlts' is written to, as 'e_highlight' has been set.
52              // output the highlight info of the credit card number.
53              hlts = result.highlights;
54              hlts.begin(doc);
55              while (await hlts.hasNext()) {
56                const highlightPageNum = await hlts.getCurrentPageNumber();
57                console.log('The current highlight is from page: ' + highlightPageNum);
58                await hlts.next();
59              }
60              // see if there is an AMEX card number
61              pattern = '\\d{4}-\\d{6}-\\d{5}';
62              txtSearch.setPattern(pattern);
63
64              ++step;
65            } else if (step === 2) {
66              // found an AMEX card number
67              console.log('\nThere is an AMEX card number:\n  ' + result.out_str);
68
69              // change mode to find the owner of the credit card; supposedly, the owner's
70              // name proceeds the number
71              mode = await txtSearch.getMode();
72              mode += PDFNet.TextSearch.Mode.e_search_up;
73              txtSearch.setMode(mode);
74              pattern = '[A-z]++ [A-z]++';
75              txtSearch.setPattern(pattern);
76
77              ++step;
78            } else if (step === 3) {
79              // found the owner's name of the AMEX card
80              console.log("Is the owner's name:\n  " + result.out_str + '?');
81
82              // add a link annotation based on the location of the found instance
83              hlts = result.highlights;
84              await hlts.begin(doc); 
85              while (await hlts.hasNext()) {
86                const curPage = await doc.getPage(await hlts.getCurrentPageNumber());
87                const quadArr = await hlts.getCurrentQuads();
88                for (let i = 0; i < quadArr.length; ++i) {
89                  const currQuad = quadArr[i];
90                  const x1 = Math.min(Math.min(Math.min(currQuad.p1x, currQuad.p2x), currQuad.p3x), currQuad.p4x);
91                  const x2 = Math.max(Math.max(Math.max(currQuad.p1x, currQuad.p2x), currQuad.p3x), currQuad.p4x);
92                  const y1 = Math.min(Math.min(Math.min(currQuad.p1y, currQuad.p2y), currQuad.p3y), currQuad.p4y);
93                  const y2 = Math.max(Math.max(Math.max(currQuad.p1y, currQuad.p2y), currQuad.p3y), currQuad.p4y);
94
95                  const hyperLink = await PDFNet.LinkAnnot.create(doc, await PDFNet.Rect.init(x1, y1, x2, y2));
96                  await hyperLink.setAction(await PDFNet.Action.createURI(doc, 'http://www.apryse.com'));
97                  await curPage.annotPushBack(hyperLink);
98                }
99                hlts.next();
100              }
101              const docBuffer = await doc.saveMemoryBuffer(PDFNet.SDFDoc.SaveOptions.e_linearized);
102              saveBufferAsPDFDoc(docBuffer, 'credit card numbers_linked.pdf');
103              break;
104            }
105          } else if (result.code === PDFNet.TextSearch.ResultCode.e_page) {
106            // you can update your UI here, if needed
107            console.log('page end');
108          } else if (result.code === PDFNet.TextSearch.ResultCode.e_done) {
109            break;
110          }
111        }
112      } catch (err) {
113        console.log(err);
114      }
115    };
116    // add your own license key as the second parameter, e.g. PDFNet.runWithCleanup(main, 'YOUR_LICENSE_KEY')
117    PDFNet.runWithCleanup(main);
118  };
119})(window);
120// eslint-disable-next-line spaced-comment
121//# sourceURL=TextSearchTest.js
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales