Search PDF for Text / String - TextSearch - Ruby Sample Code

Sample code for using the Apryse SDK to search text on PDF pages using regular expressions; provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. The TextSearch utility class builds on functionality available in the TextExtractor sample to simplify the most common search operations. Learn more about our Server SDK and PDF Indexed Search Library.

#---------------------------------------------------------------------------------------
# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
# Consult LICENSE.txt regarding license information.
#---------------------------------------------------------------------------------------

require '../../../PDFNetC/Lib/PDFNetRuby'
include PDFNetRuby
require '../../LicenseKey/RUBY/LicenseKey'

$stdout.sync = true

# This sample illustrates the basic text search capabilities of PDFNet.
#
# The script drives a single TextSearch object through four steps, changing
# the pattern and mode after every hit:
#   step 0: find the literal name "joHn sMiTh" (whole-word match)
#   step 1: switch to regular expressions and find a card number
#   step 2: look for an AMEX-style card number
#   step 3: search upwards for the owner's name, add link annotations over
#           the hit and save the document

# Relative path to the folder containing the test files.
input_path = "../../TestFiles/"
output_path = "../../TestFiles/Output/"

# Initialize PDFNet
PDFNet.Initialize(PDFTronLicense.Key)
doc = PDFDoc.new(input_path + "credit card numbers.pdf")
doc.InitSecurityHandler

txt_search = TextSearch.new
mode = TextSearch::E_whole_word | TextSearch::E_page_stop

pattern = "joHn sMiTh"

# call Begin method to initialize the text search.
txt_search.Begin(doc, pattern, mode)

step = 0

# call Run method iteratively to find all matching instances.
loop do
  searchResult = txt_search.Run

  if searchResult.IsFound
    case step
    when 0
      # step 0: found "John Smith"
      # note that, here, 'ambient_string' and 'hlts' are not written to,
      # as 'e_ambient_string' and 'e_highlight' are not set.

      puts searchResult.GetMatch + "'s credit card number is: "

      # now switch to using regular expressions to find John's credit card number
      mode = txt_search.GetMode
      mode |= TextSearch::E_reg_expression | TextSearch::E_highlight
      txt_search.SetMode(mode)
      pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}" #or "(\\d{4}-){3}\\d{4}"
      txt_search.SetPattern(pattern)
      step += 1
    when 1
      # step 1: found John's credit card number
      puts " " + searchResult.GetMatch

      # note that, here, 'hlts' is written to, as 'e_highlight' has been set.
      # output the highlight info of the credit card number
      hlts = searchResult.GetHighlights
      hlts.Begin(doc)
      while hlts.HasNext
        puts "The current highlight is from page: " + hlts.GetCurrentPageNumber.to_s
        hlts.Next
      end

      # see if there is an AMEX card number
      pattern = "\\d{4}-\\d{6}-\\d{5}"
      txt_search.SetPattern(pattern)

      step += 1
    when 2
      # found an AMEX card number
      puts "\nThere is an AMEX card number:\n " + searchResult.GetMatch

      # change mode to find the owner of the credit card; supposedly, the owner's
      # name precedes the number, so the search direction is reversed
      mode = txt_search.GetMode
      mode |= TextSearch::E_search_up
      txt_search.SetMode(mode)
      # NOTE(review): the class [A-z] also covers the ASCII punctuation that sits
      # between 'Z' and 'a'; [A-Za-z] may be what was intended -- confirm before changing.
      pattern = "[A-z]++ [A-z]++"
      txt_search.SetPattern(pattern)
      step += 1
    when 3
      # found the owner's name of the AMEX card
      puts "Is the owner's name:\n " + searchResult.GetMatch + "?"

      # add a link annotation based on the location of the found instance
      hlts = searchResult.GetHighlights
      hlts.Begin(doc)

      while hlts.HasNext
        cur_page = doc.GetPage(hlts.GetCurrentPageNumber)
        quadsInfo = hlts.GetCurrentQuads

        i = 0
        while i < quadsInfo.size
          q = quadsInfo[i]
          # assume each quad is an axis-aligned rectangle; its bounding box is
          # the min/max of the four corner coordinates
          xs = [q.p1.x, q.p2.x, q.p3.x, q.p4.x]
          ys = [q.p1.y, q.p2.y, q.p3.y, q.p4.y]
          hyper_link = Link.Create(doc.GetSDFDoc, Rect.new(xs.min, ys.min, xs.max, ys.max), Action.CreateURI(doc.GetSDFDoc, "http://www.pdftron.com"))
          cur_page.AnnotPushBack(hyper_link)
          i += 1
        end
        hlts.Next
      end
      doc.Save(output_path + "credit card numbers_linked.pdf", SDFDoc::E_linearized)
      # all four steps are done; leave the search loop
      break
    end
  elsif searchResult.IsPageEnd
    # The search paused at a page boundary because E_page_stop is set; nothing
    # to do here, just keep searching (a UI could be updated at this point).
    # The previous condition compared an undefined local `code`, which raised
    # NameError as soon as a page boundary was reported.
  else
    # neither a match nor a page boundary: stop searching
    break
  end
end

doc.Close
PDFNet.Terminate

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales