Product:

Get started

Release notes

Viewer

Basic operations

Learn more

Annotation

MS Office

Generate via template

Conversion

Smart Data Extraction

Augmenting LLMs with Smart Data Extraction

PDF/A

Accessibility

Forms

Create

Page manipulation

PDF Editing

OCR

Digital signature

Overview

Text search

Search & replace

Samples

APIs

Comparison

Bookmark

Optimization

Layer (OCG)

Redaction

Security

Portfolio

Low-level PDF API

Changelogs

Search for text in a PDF on Server/Desktop

This guide shows how to search for text in a PDF using a regular expression and then apply a link annotation to each highlighted result.

In this example we add a link annotation, but any other type of annotation can be applied instead, such as a redaction annotation in a search-and-redact workflow.

// Search the document for a credit-card-like number with a regular
// expression, then cover each highlighted quad of the match with a link.
PDFDoc doc = new PDFDoc(filename);
Int32 page_num = 0;
String result_str = "", ambient_string = "";
Highlights hlts = new Highlights();
TextSearch txt_search = new TextSearch();

//use regular expression to find credit card number
Int32 mode = (Int32)(TextSearch.SearchMode.e_whole_word
                   | TextSearch.SearchMode.e_page_stop
                   | TextSearch.SearchMode.e_reg_expression
                   | TextSearch.SearchMode.e_highlight);
txt_search.SetMode(mode);

// Declared once: the original declared `pattern` twice, which does not compile.
String pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
txt_search.SetPattern(pattern);

//call Begin() method to initialize the text search.
txt_search.Begin(doc, pattern, mode, -1, -1);
TextSearch.ResultCode code = txt_search.Run(ref page_num, ref result_str, ref ambient_string, hlts);

if (code == TextSearch.ResultCode.e_found)
{
  //add a link annotation based on the location of the found instance
  hlts.Begin(doc);
  while (hlts.HasNext())
  {
    Page cur_page = doc.GetPage(hlts.GetCurrentPageNumber());

    // Each quad is stored as 8 consecutive doubles: x,y for four corners.
    double[] quads = hlts.GetCurrentQuads();
    int quad_count = quads.Length / 8;
    for (int i = 0; i < quad_count; ++i)
    {
      //assume each quad is an axis-aligned rectangle
      int offset = 8 * i;
      double x1 = Math.Min(Math.Min(Math.Min(quads[offset + 0], quads[offset + 2]), quads[offset + 4]), quads[offset + 6]);
      double x2 = Math.Max(Math.Max(Math.Max(quads[offset + 0], quads[offset + 2]), quads[offset + 4]), quads[offset + 6]);
      double y1 = Math.Min(Math.Min(Math.Min(quads[offset + 1], quads[offset + 3]), quads[offset + 5]), quads[offset + 7]);
      double y2 = Math.Max(Math.Max(Math.Max(quads[offset + 1], quads[offset + 3]), quads[offset + 5]), quads[offset + 7]);

      Annots.Link hyper_link = Annots.Link.Create(doc, new Rect(x1, y1, x2, y2), Action.CreateURI(doc, "http://www.apryse.com"));
      hyper_link.RefreshAppearance();
      cur_page.AnnotPushBack(hyper_link);
    }
    hlts.Next();
  }
}

1PDFDoc doc(filename);
2TextSearch txt_search;
3TextSearch::Mode mode = TextSearch::e_whole_word | TextSearch::e_page_stop;
4UString pattern("");
5
6//use regular expression to find credit card number
7mode |= TextSearch::e_reg_expression | TextSearch::e_highlight;
8txt_search.SetMode(mode);
9pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
10txt_search.SetPattern(pattern);
11
12//call Begin() method to initialize the text search.
13txt_search.Begin( doc, pattern, mode );
14SearchResult result = txt_search.Run();
15
16if ( result )
17{
18  //add a link annotation based on the location of the found instance
19  Highlights hlts = result.GetHighlights();
20  hlts.Begin(doc);
21  while ( hlts.HasNext() )
22  {
23    Page cur_page= doc.GetPage(hlts.GetCurrentPageNumber());
24    const double *quads;
25    int quad_count = hlts.GetCurrentQuads(quads);
26    for ( int i = 0; i < quad_count; ++i )
27    {
28      //assume each quad is an axis-aligned rectangle
29      const double *q = &quads[8*i];
30      double x1 = min(min(min(q[0], q[2]), q[4]), q[6]);
31      double x2 = max(max(max(q[0], q[2]), q[4]), q[6]);
32      double y1 = min(min(min(q[1], q[3]), q[5]), q[7]);
33      double y2 = max(max(max(q[1], q[3]), q[5]), q[7]);
34      Annots::Link hyper_link = Annots::Link::Create(doc, Rect(x1, y1, x2, y2), Action::CreateURI(doc, "http://www.apryse.com"));
35      cur_page.AnnotPushBack(hyper_link);
36    }
37    hlts.Next();
38  }
39}

1doc := NewPDFDoc(filename)
2txtSearch := NewTextSearch()
3mode := TextSearchE_whole_word | TextSearchE_page_stop
4pattern := ""
5
6mode = mode | TextSearchE_reg_expression | TextSearchE_highlight
7txtSearch.SetMode(uint(mode))
8pattern := "\\d{4}-\\d{4}-\\d{4}-\\d{4}"     //or "(\\d{4}-){3}\\d{4}"
9txtSearch.SetPattern(pattern)
10
11// call Begin() method to initialize the text search.
12txtSearch.Begin(doc, pattern, uint(mode))
13searchResult := txtSearch.Run()
14
15if searchResult.IsFound(){
16  // add a link annotation based on the location of the found instance
17  hlts := searchResult.GetHighlights()
18  hlts.Begin(doc)
19  
20  for hlts.HasNext(){
21    curPage := doc.GetPage(uint(hlts.GetCurrentPageNumber()))
22    quadsInfo := hlts.GetCurrentQuads()
23    
24    i := 0
25    for i < int(quadsInfo.Size()){
26      q := quadsInfo.Get(i)
27      // assume each quad is an axis-aligned rectangle 
28      x1 := Min(Min(Min(q.GetP1().GetX(), q.GetP2().GetX()), q.GetP3().GetX()), q.GetP4().GetX())
29      x2 := Max(Max(Max(q.GetP1().GetX(), q.GetP2().GetX()), q.GetP3().GetX()), q.GetP4().GetX())
30      y1 := Min(Min(Min(q.GetP1().GetY(), q.GetP2().GetY()), q.GetP3().GetY()), q.GetP4().GetY())
31      y2 := Max(Max(Max(q.GetP1().GetY(), q.GetP2().GetY()), q.GetP3().GetY()), q.GetP4().GetY())
32      hyperLink := LinkCreate(doc.GetSDFDoc(), NewRect(x1, y1, x2, y2), ActionCreateURI(doc.GetSDFDoc(), "http://www.apryse.com"))
33      curPage.AnnotPushBack(hyperLink)
34      i = i + 1
35    }
36    hlts.Next()
37  }
38}

1PDFDoc doc = new PDFDoc(filename);
2TextSearch txt_search = new TextSearch();
3int mode = TextSearch.e_whole_word | TextSearch.e_page_stop;
4String pattern = "";
5
6//use regular expression to find credit card number
7mode |= TextSearch.e_reg_expression | TextSearch.e_highlight;
8txt_search.setMode(mode);
9String new_pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
10txt_search.setPattern(new_pattern);
11
12//call Begin() method to initialize the text search.
13txt_search.begin(doc, pattern, mode, -1, -1);
14TextSearchResult result = txt_search.run();
15
16if (result.getCode() == TextSearchResult.e_found) {
17  //add a link annotation based on the location of the found instance
18  Highlights hlts = result.getHighlights();
19  hlts.begin(doc);
20  while (hlts.hasNext()) {
21    Page cur_page = doc.getPage(hlts.getCurrentPageNumber());
22    double[] q = hlts.getCurrentQuads();
23    int quad_count = q.length / 8;
24    for (int i = 0; i < quad_count; ++i) {
25      //assume each quad is an axis-aligned rectangle
26      int offset = 8 * i;
27      double x1 = Math.min(Math.min(Math.min(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6]);
28      double x2 = Math.max(Math.max(Math.max(q[offset + 0], q[offset + 2]), q[offset + 4]), q[offset + 6]);
29      double y1 = Math.min(Math.min(Math.min(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7]);
30      double y2 = Math.max(Math.max(Math.max(q[offset + 1], q[offset + 3]), q[offset + 5]), q[offset + 7]);
31      annots.Link hyper_link = annots.Link.create(doc, new Rect(x1, y1, x2, y2), Action.createURI(doc, "http://www.apryse.com"));
32      cur_page.annotPushBack(hyper_link);
33    }
34    hlts.next();
35  }
36}

/**
 * Searches the document for a credit-card-like number with a regular
 * expression, then covers each highlighted quad of the match with a link.
 */
async function main() {
  const doc = await PDFNet.PDFDoc.createFromURL(filename);
  const txtSearch = await PDFNet.TextSearch.create();

  // Combine flags with bitwise OR; `+` would corrupt the mask if a flag
  // were ever added twice.
  let mode = PDFNet.TextSearch.Mode.e_whole_word | PDFNet.TextSearch.Mode.e_page_stop;

  //use regular expression to find credit card number
  mode |= PDFNet.TextSearch.Mode.e_reg_expression | PDFNet.TextSearch.Mode.e_highlight;
  await txtSearch.setMode(mode);
  const pattern = '\\d{4}-\\d{4}-\\d{4}-\\d{4}'; // or "(\\d{4}-){3}\\d{4}"
  await txtSearch.setPattern(pattern);

  //call Begin() method to initialize the text search.
  await txtSearch.begin(doc, pattern, mode);
  const result = await txtSearch.run();

  if (result.code === PDFNet.TextSearch.ResultCode.e_found) {
    // add a link annotation based on the location of the found instance
    // Declared with const: the original assigned to an undeclared global.
    const hlts = result.highlights;
    await hlts.begin(doc);
    while (await hlts.hasNext()) {
      const curPage = await doc.getPage(await hlts.getCurrentPageNumber());
      const quadArr = await hlts.getCurrentQuads();
      for (const currQuad of quadArr) {
        // assume each quad is an axis-aligned rectangle
        const x1 = Math.min(currQuad.p1x, currQuad.p2x, currQuad.p3x, currQuad.p4x);
        const x2 = Math.max(currQuad.p1x, currQuad.p2x, currQuad.p3x, currQuad.p4x);
        const y1 = Math.min(currQuad.p1y, currQuad.p2y, currQuad.p3y, currQuad.p4y);
        const y2 = Math.max(currQuad.p1y, currQuad.p2y, currQuad.p3y, currQuad.p4y);

        const hyperLink = await PDFNet.LinkAnnot.create(doc, await PDFNet.Rect.init(x1, y1, x2, y2));
        await hyperLink.setAction(await PDFNet.Action.createURI(doc, 'http://www.apryse.com'));
        await curPage.annotPushBack(hyperLink);
      }
      // Await the advance so the next hasNext() sees the updated position.
      await hlts.next();
    }
  }
}
PDFNet.runWithCleanup(main);

// Search the document for a credit-card-like number with a regular
// expression, then cover each highlighted quad of the match with a link.
PTPDFDoc *doc = [[PTPDFDoc alloc] initWithFilepath: filename];
PTTextSearch *txt_search = [[PTTextSearch alloc] init];
unsigned int mode = e_ptwhole_word | e_ptpage_stop;
NSString *pattern = @"";

//use regular expression to find credit card number
mode |= e_ptreg_expression | e_pthighlight;
[txt_search SetMode: mode];
pattern = @"\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
[txt_search SetPattern: pattern];

//call Begin() method to initialize the text search.
[txt_search Begin: doc pattern: pattern mode: mode start_page: -1 end_page: -1];
PTSearchResult *result = [txt_search Run];

// Ask the result whether a match was found; testing the pointer alone only
// shows that a result object was returned.
if ( [result IsFound] )
{
  //add a link annotation based on the location of the found instance
  PTHighlights *hlts = [result GetHighlights];
  [hlts Begin: doc];
  while ( [hlts HasNext] )
  {
    PTPage *cur_page = [doc GetPage: [hlts GetCurrentPageNumber]];
    PTVectorQuadPoint *quads = [hlts GetCurrentQuads];
    for ( int i = 0; i < [quads size]; ++i )
    {
      //assume each quad is an axis-aligned rectangle
      PTQuadPoint *q = [quads get: i];
      double x1 = MIN(MIN(MIN([[q getP1] getX], [[q getP2] getX]), [[q getP3] getX]), [[q getP4] getX]);
      double x2 = MAX(MAX(MAX([[q getP1] getX], [[q getP2] getX]), [[q getP3] getX]), [[q getP4] getX]);
      double y1 = MIN(MIN(MIN([[q getP1] getY], [[q getP2] getY]), [[q getP3] getY]), [[q getP4] getY]);
      double y2 = MAX(MAX(MAX([[q getP1] getY], [[q getP2] getY]), [[q getP3] getY]), [[q getP4] getY]);
      PTPDFRect *rect = [[PTPDFRect alloc] initWithX1: x1 y1: y1 x2: x2 y2: y2];
      PTAction *action = [PTAction CreateURI: [doc GetSDFDoc] uri: @"http://www.apryse.com"];

      PTLink *hyper_link = [PTLink CreateWithAction: [doc GetSDFDoc] pos: rect action: action];
      [cur_page AnnotPushBack: hyper_link];
    }
    [hlts Next];
  }
}

// Search the document for a credit-card-like number with a regular
// expression, then cover each highlighted quad of the match with a link.
$doc = new PDFDoc($filename);
$txt_search = new TextSearch();
$mode = TextSearch::e_whole_word | TextSearch::e_page_stop;
$pattern = "";

//use regular expression to find credit card number
$mode |= TextSearch::e_reg_expression | TextSearch::e_highlight;
$txt_search->SetMode($mode);
$pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"; //or "(\\d{4}-){3}\\d{4}"
$txt_search->SetPattern($pattern);

//call Begin() method to initialize the text search.
$txt_search->Begin( $doc, $pattern, $mode );
$searchResult = $txt_search->Run();

if ( $searchResult->IsFound() )
{
  //add a link annotation based on the location of the found instance
  $hlts = $searchResult->GetHighlights();
  $hlts->Begin($doc);
  while ( $hlts->HasNext() )
  {
    $cur_page = $doc->GetPage($hlts->GetCurrentPageNumber());
    $quadsInfo = $hlts->GetCurrentQuads();

    for ( $i = 0; $i < $quadsInfo->size(); ++$i )
    {
      //assume each quad is an axis-aligned rectangle
      $q = $quadsInfo->get($i);
      $x1 = min(min(min($q->p1->x, $q->p2->x), $q->p3->x), $q->p4->x);
      $x2 = max(max(max($q->p1->x, $q->p2->x), $q->p3->x), $q->p4->x);
      $y1 = min(min(min($q->p1->y, $q->p2->y), $q->p3->y), $q->p4->y);
      $y2 = max(max(max($q->p1->y, $q->p2->y), $q->p3->y), $q->p4->y);
      $hyper_link = Link::Create($doc->GetSDFDoc(), new Rect($x1, $y1, $x2, $y2), Action::CreateURI($doc->GetSDFDoc(), "http://www.apryse.com"));
      $cur_page->AnnotPushBack($hyper_link);
    }
    $hlts->Next();
  }
}

# Search the document for a credit-card-like number with a regular
# expression, then cover each highlighted quad of the match with a link.
doc = PDFDoc(filename)
txt_search = TextSearch()
mode = TextSearch.e_whole_word | TextSearch.e_page_stop
pattern = ""

# use regular expression to find credit card number
mode |= TextSearch.e_reg_expression | TextSearch.e_highlight
txt_search.SetMode(mode)
pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"     #or "(\\d{4}-){3}\\d{4}"
txt_search.SetPattern(pattern)

# call Begin() method to initialize the text search.
txt_search.Begin(doc, pattern, mode)
searchResult = txt_search.Run()

if searchResult.IsFound():
  # add a link annotation based on the location of the found instance
  hlts = searchResult.GetHighlights()
  hlts.Begin(doc)

  while hlts.HasNext():
    cur_page = doc.GetPage(hlts.GetCurrentPageNumber())
    quadsInfo = hlts.GetCurrentQuads()

    for i in range(len(quadsInfo)):
      q = quadsInfo[i]
      # assume each quad is an axis-aligned rectangle
      x1 = min(q.p1.x, q.p2.x, q.p3.x, q.p4.x)
      x2 = max(q.p1.x, q.p2.x, q.p3.x, q.p4.x)
      y1 = min(q.p1.y, q.p2.y, q.p3.y, q.p4.y)
      y2 = max(q.p1.y, q.p2.y, q.p3.y, q.p4.y)
      hyper_link = Link.Create(doc.GetSDFDoc(), Rect(x1, y1, x2, y2), Action.CreateURI(doc.GetSDFDoc(), "http://www.apryse.com"))
      cur_page.AnnotPushBack(hyper_link)
    hlts.Next()

# Search the document for a credit-card-like number with a regular
# expression, then cover each highlighted quad of the match with a link.
doc = PDFDoc.new(filename)
txt_search = TextSearch.new
mode = TextSearch::E_whole_word | TextSearch::E_page_stop
pattern = ""

# use regular expression to find credit card number
mode |= TextSearch::E_reg_expression | TextSearch::E_highlight
txt_search.SetMode(mode)
pattern = "\\d{4}-\\d{4}-\\d{4}-\\d{4}"  #or "(\\d{4}-){3}\\d{4}"
txt_search.SetPattern(pattern)

# call Begin method to initialize the text search.
txt_search.Begin(doc, pattern, mode)
searchResult = txt_search.Run

if searchResult.IsFound
  # add a link annotation based on the location of the found instance
  hlts = searchResult.GetHighlights
  hlts.Begin(doc)

  while hlts.HasNext do
    cur_page = doc.GetPage(hlts.GetCurrentPageNumber)
    quadsInfo = hlts.GetCurrentQuads

    i = 0
    while i < quadsInfo.size do
      q = quadsInfo[i]
      # assume each quad is an axis-aligned rectangle
      xs = [q.p1.x, q.p2.x, q.p3.x, q.p4.x]
      ys = [q.p1.y, q.p2.y, q.p3.y, q.p4.y]
      hyper_link = Link.Create(doc.GetSDFDoc, Rect.new(xs.min, ys.min, xs.max, ys.max), Action.CreateURI(doc.GetSDFDoc, "http://www.apryse.com"))
      cur_page.AnnotPushBack(hyper_link)
      i = i + 1
    end
    hlts.Next
  end
end

' Search the document for a credit-card-like number with a regular
' expression, then cover each highlighted quad of the match with a link.
Dim doc As PDFDoc = New PDFDoc(filename)
Dim page_num As Int32 = 0
Dim result_str As String = "", ambient_string As String = ""
Dim hlts As Highlights = New Highlights()
Dim txt_search As TextSearch = New TextSearch()
Dim mode As Int32 = CInt((TextSearch.SearchMode.e_whole_word Or TextSearch.SearchMode.e_page_stop Or TextSearch.SearchMode.e_highlight))
Dim pattern As String = ""

' use regular expression to find credit card number
mode = mode Or CInt(TextSearch.SearchMode.e_reg_expression)
txt_search.SetMode(mode)
pattern = "\d{4}-\d{4}-\d{4}-\d{4}" ' or "(\d{4}-){3}\d{4}"
txt_search.SetPattern(pattern)

' call Begin method to initialize the text search.
txt_search.Begin(doc, pattern, mode, -1, -1)
Dim code As TextSearch.ResultCode = txt_search.Run(page_num, result_str, ambient_string, hlts)

If code = TextSearch.ResultCode.e_found Then
  ' add a link annotation based on the location of the found instance
  hlts.Begin(doc)
  While hlts.HasNext()
    Dim cur_page As Page = doc.GetPage(hlts.GetCurrentPageNumber())
    Dim quads As Double() = hlts.GetCurrentQuads()
    ' Integer division (\): "/" yields a Double, which cannot be assigned
    ' to an Integer under Option Strict.
    Dim quad_count As Integer = quads.Length \ 8

    For i As Integer = 0 To quad_count - 1
      ' assume each quad is an axis-aligned rectangle
      Dim offset As Integer = 8 * i
      Dim x1 As Double = Math.Min(Math.Min(Math.Min(quads(offset + 0), quads(offset + 2)), quads(offset + 4)), quads(offset + 6))
      Dim x2 As Double = Math.Max(Math.Max(Math.Max(quads(offset + 0), quads(offset + 2)), quads(offset + 4)), quads(offset + 6))
      Dim y1 As Double = Math.Min(Math.Min(Math.Min(quads(offset + 1), quads(offset + 3)), quads(offset + 5)), quads(offset + 7))
      Dim y2 As Double = Math.Max(Math.Max(Math.Max(quads(offset + 1), quads(offset + 3)), quads(offset + 5)), quads(offset + 7))
      Dim hyper_link As pdftron.PDF.Annots.Link = pdftron.PDF.Annots.Link.Create(doc, New Rect(x1, y1, x2, y2), pdftron.PDF.Action.CreateURI(doc, "http://www.apryse.com"))
      hyper_link.RefreshAppearance()
      cur_page.AnnotPushBack(hyper_link)
    Next

    hlts.Next()
  End While
End If

Search PDF files for text - Full Sample
Full code sample which shows how to use TextSearch to search text on PDF pages using regular expressions.

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales

Product:

Product:

Search for text in a PDF on Server/Desktop

Related Links

Related Links

Related Links

Related Links