Some test text!

Search
Hamburger Icon

Python / Guides / Text search

Search for text in a PDF in Python

The following example searches for text in a PDF using a regular expression and then applies a link annotation to the highlighted result.

In this example, we add a link annotation, but any other type of annotation can be applied here, such as redaction annotations in a search-and-redact workflow.
# Open the document and configure a whole-word search. e_page_stop makes the
# search hand control back to the caller at the end of every page.
doc = PDFDoc(filename)
txt_search = TextSearch()
mode = TextSearch.e_whole_word | TextSearch.e_page_stop

# Use a regular expression to find a credit card number, and request
# highlight information so the match location can be annotated.
mode |= TextSearch.e_reg_expression | TextSearch.e_highlight
txt_search.SetMode(mode)
pattern = r"\d{4}-\d{4}-\d{4}-\d{4}"     # or r"(\d{4}-){3}\d{4}"
txt_search.SetPattern(pattern)

# Call Begin() to initialize the text search.
txt_search.Begin(doc, pattern, mode)

# Because e_page_stop is set, Run() can return a "page end" result before it
# reaches a match. Keep running until a match is found or the document ends,
# otherwise a match on a later page would be silently missed.
searchResult = txt_search.Run()
while searchResult.IsPageEnd():
    searchResult = txt_search.Run()

if searchResult.IsFound():
    # Add a link annotation based on the location of the found instance.
    hlts = searchResult.GetHighlights()
    hlts.Begin(doc)
    sdf_doc = doc.GetSDFDoc()

    while hlts.HasNext():
        cur_page = doc.GetPage(hlts.GetCurrentPageNumber())

        for quad in hlts.GetCurrentQuads():
            # Assume each quad is an axis-aligned rectangle: its bounding box
            # is the min/max of the four corner coordinates.
            corners = (quad.p1, quad.p2, quad.p3, quad.p4)
            xs = [corner.x for corner in corners]
            ys = [corner.y for corner in corners]
            link_rect = Rect(min(xs), min(ys), max(xs), max(ys))

            hyper_link = Link.Create(
                sdf_doc,
                link_rect,
                Action.CreateURI(sdf_doc, "http://www.pdftron.com"),
            )
            cur_page.AnnotPushBack(hyper_link)

        hlts.Next()

Search PDF files for text
Full code sample which shows how to use TextSearch to search text on PDF pages using regular expressions.

Get the answers you need: Chat with us