Search PDF for Text / String - TextSearch - Python Sample Code

Sample code for using the Apryse SDK to search text on PDF pages using regular expressions; provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. The TextSearch utility class builds on functionality available in the TextExtractor sample to simplify the most common search operations. Learn more about our Server SDK and PDF Indexed Search Library.

#---------------------------------------------------------------------------------------
# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
# Consult LICENSE.txt regarding license information.
#---------------------------------------------------------------------------------------
5
# Make the PDFNet native library directory importable before loading the
# Python bindings; the order of these statements matters.
import site
site.addsitedir("../../../PDFNetC/Lib")
import sys
from PDFNetPython import *

# The license key module lives outside this sample's directory.
sys.path.append("../../LicenseKey/PYTHON")
from LicenseKey import *

# This sample illustrates the basic text search capabilities of PDFNet.

# Relative path to the folder containing the test files.
input_path = "../../TestFiles/"
# Relative path to the folder that receives generated files.
output_path = "../../TestFiles/Output/"
19
def main():
    """Run the text-search sample on "credit card numbers.pdf".

    The search is driven as a small state machine (``step``) over repeated
    calls to ``TextSearch.Run()``:

    0. find the whole-word match for "joHn sMiTh", then switch to regular
       expression mode with highlighting enabled;
    1. find a 4-4-4-4 digit card number and print the page of each highlight;
    2. find a 4-6-5 digit (AMEX-style) number, then reverse the search
       direction to look for the owner's name;
    3. find the name, add a link annotation over each highlighted quad, save
       the document to ``output_path`` and stop.

    Page-boundary results are skipped; any other non-match result ends the
    search. The document is closed and PDFNet terminated before returning.
    """
    # Initialize PDFNet
    PDFNet.Initialize(LicenseKey)
    doc = PDFDoc(input_path + "credit card numbers.pdf")
    doc.InitSecurityHandler()

    txt_search = TextSearch()
    mode = TextSearch.e_whole_word | TextSearch.e_page_stop

    pattern = "joHn sMiTh"

    # call Begin() method to initialize the text search.
    txt_search.Begin(doc, pattern, mode)

    step = 0

    # call Run() method iteratively to find all matching instances.
    while True:
        searchResult = txt_search.Run()
        if searchResult.IsFound():
            if step == 0:
                # step 0: found "John Smith"
                # note that, here, 'ambient_string' and 'hlts' are not written to,
                # as 'e_ambient_string' and 'e_highlight' are not set.

                print(str(searchResult.GetMatch()) + "'s credit card number is: ")

                # now switch to using regular expressions to find John's credit card number
                mode = txt_search.GetMode()
                mode |= TextSearch.e_reg_expression | TextSearch.e_highlight
                txt_search.SetMode(mode)
                pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"  # or "(\\d{4}-){3}\\d{4}"
                txt_search.SetPattern(pattern)
                step = step + 1
            elif step == 1:
                # step 1: found John's credit card number
                print(" " + searchResult.GetMatch())

                # note that, here, 'hlts' is written to, as 'e_highlight' has been set.
                # output the highlight info of the credit card number
                hlts = searchResult.GetHighlights()
                hlts.Begin(doc)
                while hlts.HasNext():
                    print("The current highlight is from page: " + str(hlts.GetCurrentPageNumber()))
                    hlts.Next()

                # see if there is an AMEX card number
                pattern = "\\d{4}-\\d{6}-\\d{5}"
                txt_search.SetPattern(pattern)

                step = step + 1
            elif step == 2:
                # found an AMEX card number
                print("\nThere is an AMEX card number:\n " + searchResult.GetMatch())

                # change mode to find the owner of the credit card; supposedly, the owner's
                # name precedes the number
                mode = txt_search.GetMode()
                mode |= TextSearch.e_search_up
                txt_search.SetMode(mode)
                # NOTE(review): the range [A-z] also spans the ASCII punctuation
                # between 'Z' and 'a'; [A-Za-z] may be what is intended - confirm.
                pattern = "[A-z]++ [A-z]++"
                txt_search.SetPattern(pattern)
                step = step + 1
            elif step == 3:
                # found the owner's name of the AMEX card
                print("Is the owner's name:\n " + searchResult.GetMatch() + "?")

                # add a link annotation based on the location of the found instance
                hlts = searchResult.GetHighlights()
                hlts.Begin(doc)

                while hlts.HasNext():
                    cur_page = doc.GetPage(hlts.GetCurrentPageNumber())
                    quadsInfo = hlts.GetCurrentQuads()

                    # Index-based access mirrors the len()/[] usage the SDK
                    # container is known to support here.
                    for i in range(len(quadsInfo)):
                        q = quadsInfo[i]
                        # assume each quad is an axis-aligned rectangle
                        x1 = min(q.p1.x, q.p2.x, q.p3.x, q.p4.x)
                        x2 = max(q.p1.x, q.p2.x, q.p3.x, q.p4.x)
                        y1 = min(q.p1.y, q.p2.y, q.p3.y, q.p4.y)
                        y2 = max(q.p1.y, q.p2.y, q.p3.y, q.p4.y)
                        hyper_link = Link.Create(
                            doc.GetSDFDoc(),
                            Rect(x1, y1, x2, y2),
                            Action.CreateURI(doc.GetSDFDoc(), "http://www.pdftron.com"),
                        )
                        cur_page.AnnotPushBack(hyper_link)
                    hlts.Next()

                doc.Save(output_path + "credit card numbers_linked.pdf", SDFDoc.e_linearized)
                # last step done: leave the search loop
                break
        elif searchResult.IsPageEnd():
            # Run() reported a page boundary rather than a match
            # (e_page_stop is set in the mode); keep searching.
            pass
        else:
            # neither a match nor a page boundary: the search is finished
            break

    doc.Close()
    PDFNet.Terminate()
116
# Run the sample only when this file is executed as a script.
if __name__ == '__main__':
    main()

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales