More languages
Some test text!
More languages
Sample VB code for using PDFTron SDK to read a PDF (parse and extract text). If you'd like to search text on PDF pages, see our code sample for text search. Learn more about our VB PDF Library and PDF Parsing & Content Extraction Library.
Get Started | Samples | Download. To run this sample, get started with a free trial of the Apryse SDK.
'
' Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
'
Imports System
Imports System.Drawing
Imports pdftron
Imports pdftron.Common
Imports pdftron.Filters
Imports pdftron.SDF
Imports PDFTRON.PDF
' This sample illustrates various text extraction capabilities of PDFNet.
Module TextExtractTestVB
Dim pdfNetLoader As PDFNetLoader
' Module constructor: fetches the PDFNetLoader instance and keeps it in the
' module-level pdfNetLoader field.
' NOTE(review): presumably this is what makes the native PDFNet library available
' before Main calls PDFNet.Initialize - confirm against the PDFNetLoader docs.
Sub New()
pdfNetLoader = pdftron.PDFNetLoader.Instance()
End Sub
' Entry point of the sample.
'
' Initializes PDFNet, runs the text-extraction examples selected by the
' example* flags below against "newsletter.pdf", and always terminates PDFNet
' on the way out (including the early "Page not found." exit).
' Errors are reported on the console; nothing is rethrown.
Sub Main()
    PDFNet.Initialize(PDFTronLicense.Key)

    Try
        ' Relative path to the folder containing test files.
        Dim input_path As String = "../../../../TestFiles/"

        ' Switches selecting which of the examples below are executed.
        Dim example1_basic As Boolean = False
        Dim example2_xml As Boolean = False
        Dim example3_wordlist As Boolean = False
        Dim example4_advanced As Boolean = True
        Dim example5_low_level As Boolean = False

        ' Sample code showing how to use high-level text extraction APIs.
        Try
            Using doc As PDFDoc = New PDFDoc(input_path + "newsletter.pdf")
                doc.InitSecurityHandler()

                Dim pg As Page = doc.GetPage(1)
                If pg Is Nothing Then
                    Console.WriteLine("Page not found.")
                    ' The enclosing Finally still runs PDFNet.Terminate().
                    Return
                End If

                Using txt As TextExtractor = New TextExtractor
                    txt.Begin(pg) ' Read the page.
                    ' Other options you may want to consider...
                    ' txt.Begin(page, Nothing, TextExtractor.ProcessingFlags.e_no_dup_remove)
                    ' txt.Begin(page, Nothing, TextExtractor.ProcessingFlags.e_remove_hidden_text)
                    ' ...

                    ' Example 1. Get all text on the page in a single string.
                    ' Words will be separated with space or new line characters.
                    If example1_basic Then
                        ' Get the word count.
                        Console.WriteLine("Word Count: {0}", txt.GetWordCount())
                        Console.WriteLine("")
                        Console.WriteLine("- GetAsText --------------------------")
                        Console.WriteLine(txt.GetAsText())
                        Console.WriteLine("-----------------------------------------------------------")
                    End If

                    ' Example 2. Get XML logical structure for the page.
                    If example2_xml Then
                        Console.WriteLine("")
                        Console.WriteLine("- GetAsXML --------------------------")
                        Console.WriteLine(txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements Or TextExtractor.XMLOutputFlags.e_output_bbox Or TextExtractor.XMLOutputFlags.e_output_style_info))
                        Console.WriteLine("-----------------------------------------------------------")
                    End If

                    ' Example 3. Print every word on the page, one word per output line.
                    If example3_wordlist Then
                        Dim word As TextExtractor.Word
                        Dim line As TextExtractor.Line = txt.GetFirstLine()
                        While line.IsValid()
                            word = line.GetFirstWord()
                            While word.IsValid()
                                Console.WriteLine(word.GetString())
                                word = word.GetNextWord()
                            End While
                            line = line.GetNextLine()
                        End While
                        Console.WriteLine("-----------------------------------------------------------")
                    End If

                    ' Example 4. A more advanced text extraction example.
                    ' The output is XML structure containing paragraphs, lines, words,
                    ' as well as style and positioning information.
                    If example4_advanced Then
                        Dim bbox As Rect
                        Dim cur_flow_id As Integer = -1
                        Dim cur_para_id As Integer = -1
                        Dim line As TextExtractor.Line
                        Dim word As TextExtractor.Word
                        Dim s As TextExtractor.Style
                        Dim line_style As TextExtractor.Style

                        Console.WriteLine("<PDFText>")

                        ' For each line on the page...
                        line = txt.GetFirstLine()
                        While line.IsValid()
                            ' A new flow closes the open paragraph and flow (if any) first.
                            If cur_flow_id <> line.GetFlowID() Then
                                If cur_flow_id <> -1 Then
                                    If cur_para_id <> -1 Then
                                        cur_para_id = -1
                                        Console.WriteLine("</Para>")
                                    End If
                                    Console.WriteLine("</Flow>")
                                End If
                                cur_flow_id = line.GetFlowID()
                                Console.WriteLine("<Flow id=""{0}"">", cur_flow_id)
                            End If

                            ' A new paragraph closes the open paragraph (if any) first.
                            If cur_para_id <> line.GetParagraphID() Then
                                If cur_para_id <> -1 Then
                                    Console.WriteLine("</Para>")
                                End If
                                cur_para_id = line.GetParagraphID()
                                Console.WriteLine("<Para id=""{0}"">", cur_para_id)
                            End If

                            bbox = line.GetBBox()
                            line_style = line.GetStyle()
                            Console.Write("<Line box=""{0}, {1}, {2}, {3}""", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"))
                            PrintStyle(line_style)
                            Console.Write(" cur_num=""" & line.GetCurrentNum() & """")
                            Console.WriteLine(">")

                            ' For each word in the line...
                            word = line.GetFirstWord()
                            While word.IsValid()
                                ' Empty words are skipped before anything is written, so no
                                ' dangling "<Word" fragment is emitted. The advance to the next
                                ' word below always runs, so the loop cannot spin forever.
                                If word.GetStringLen() > 0 Then
                                    ' Output the bounding box for the word.
                                    bbox = word.GetBBox()
                                    Console.Write("<Word box=""{0}, {1}, {2}, {3}""", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"))
                                    Console.Write(" cur_num=""" & word.GetCurrentNum() & """")

                                    ' If the word style is different from the parent style, output the new style.
                                    s = word.GetStyle()
                                    If Not s.Equals(line_style) Then
                                        PrintStyle(s)
                                    End If

                                    Console.Write(">")
                                    Console.Write(word.GetString())
                                    Console.WriteLine("</Word>")
                                End If
                                word = word.GetNextWord()
                            End While

                            Console.WriteLine("</Line>")
                            line = line.GetNextLine()
                        End While

                        ' Close whatever is still open after the last line.
                        If cur_flow_id <> -1 Then
                            If cur_para_id <> -1 Then
                                cur_para_id = -1
                                Console.WriteLine("</Para>")
                            End If
                            Console.WriteLine("</Flow>")
                        End If

                        ' The root element is closed only where it was opened.
                        Console.WriteLine("</PDFText>")
                    End If
                End Using
            End Using
        Catch ex As PDFNetException
            Console.WriteLine(ex.Message)
        Catch ex As Exception
            ' Console output (not MsgBox) so the sample never blocks on a dialog.
            Console.WriteLine(ex.Message)
        End Try

        ' Sample code showing how to use low-level text extraction APIs.
        If example5_low_level Then
            Try
                ' Open the test file
                Using doc As PDFDoc = New PDFDoc(input_path + "newsletter.pdf")
                    doc.InitSecurityHandler()

                    Using reader As ElementReader = New ElementReader
                        ' Example 1. Extract all text content from the page the iterator
                        ' points at. Re-enable the commented loop to walk every page.
                        Dim itr As PageIterator = doc.GetPageIterator()
                        ' While itr.HasNext()
                        reader.Begin(itr.Current())
                        DumpAllText(reader)
                        reader.End()
                        ' itr.Next()
                        ' End While

                        ' Example 2. Extract text based on the selection rectangle.
                        Console.WriteLine("----------------------------------------------------")
                        Console.WriteLine("Extract text based on the selection rectangle.")
                        Console.WriteLine("----------------------------------------------------")

                        Dim first_page As Page = doc.GetPage(1)
                        If first_page Is Nothing Then
                            Console.WriteLine("Page not found.")
                        Else
                            Dim field1 As String = ReadTextFromRect(first_page, New Rect(27, 392, 563, 534), reader)
                            Dim field2 As String = ReadTextFromRect(first_page, New Rect(28, 551, 106, 623), reader)
                            Dim field3 As String = ReadTextFromRect(first_page, New Rect(208, 550, 387, 621), reader)

                            Console.WriteLine("Field 1: {0}", field1)
                            Console.WriteLine("Field 2: {0}", field2)
                            Console.WriteLine("Field 3: {0}", field3)
                        End If
                        ' ...

                        Console.WriteLine("Done.")
                    End Using
                End Using
            Catch ex As PDFNetException
                Console.WriteLine(ex.Message)
            Catch ex As Exception
                Console.WriteLine(ex.Message)
            End Try
        End If
    Finally
        ' Runs on every exit path, including the early Return above.
        PDFNet.Terminate()
    End Try
End Sub
' Writes a style="..." attribute describing the given text style to the console
' (no trailing newline): font name, font size, an optional generic-family hint,
' and the color as a six-digit hex value.
'
' s - the style to describe; it is only read.
Sub PrintStyle(ByRef s As TextExtractor.Style)
    Dim text_color As Color = s.GetColor()
    ' Two uppercase hex digits per channel, followed by the closing ";".
    Dim rgb_hex As String = String.Format("{0:X02}{1:X02}{2:X02};", text_color.R, text_color.G, text_color.B)

    ' The hint is emitted only when IsSerif() is true, so it must read "serif";
    ' labelling serif fonts as "sans-serif" was inverted.
    Dim family_hint As String = ""
    If s.IsSerif() Then
        family_hint = " serif;"
    End If

    Console.Write(" style=""font-family:{0}; font-size:{1};{2} color:#{3}""", s.GetFontName(), s.GetFontSize(), family_hint, rgb_hex)
End Sub
' LowLevelTextExtractUtils ----------------------------------------
' Prints every text run delivered by the reader, framing each text block with
' "--> Text Block Begin" / "--> Text Block End" markers.
' Form XObjects are entered with FormBegin() and dumped recursively.
'
' reader - an ElementReader on which Begin/FormBegin has already been called;
'          it is consumed until Next() returns Nothing.
Sub DumpAllText(ByRef reader As ElementReader)
    Dim current As Element = reader.Next()

    Do Until current Is Nothing ' Read page contents
        Select Case current.GetType()
            Case Element.Type.e_text_begin
                Console.WriteLine()
                Console.WriteLine("--> Text Block Begin")

            Case Element.Type.e_text_end
                Console.WriteLine()
                Console.WriteLine("--> Text Block End")

            Case Element.Type.e_text
                Dim bounds As Rect = New Rect
                current.GetBBox(bounds)
                ' Console.WriteLine("\n--> BBox: {0}, {1}, {2}, {3}", bounds.x1, bounds.y1, bounds.x2, bounds.y2)
                Console.WriteLine(current.GetTextString())

            Case Element.Type.e_text_new_line
                ' Intentionally silent.
                ' Console.WriteLine()
                ' Console.WriteLine("--> New Line")

            Case Element.Type.e_form
                reader.FormBegin() ' Process form XObjects
                DumpAllText(reader)
                reader.End()
        End Select

        current = reader.Next()
    Loop
End Sub
Private _srch_str As String
' Worker for ReadTextFromRect: walks the elements delivered by the reader and
' appends to _srch_str the string of every text element whose bounding box
' intersects pos. Form XObjects are entered and searched recursively.
'
' reader - an ElementReader already positioned by Begin/FormBegin.
' pos    - the selection rectangle the text boxes are tested against.
Sub RectTextSearch(ByRef reader As ElementReader, ByRef pos As Rect)
    Dim current As Element = reader.Next()

    Do While current IsNot Nothing ' Read page contents
        Dim kind As Element.Type = current.GetType()

        If kind = Element.Type.e_form Then
            reader.FormBegin() ' Process form XObjects
            RectTextSearch(reader, pos)
            reader.End()
        ElseIf kind = Element.Type.e_text Then
            Dim bounds As New Rect
            current.GetBBox(bounds)
            ' IntersectRect is given bounds as both receiver and first argument;
            ' only its Boolean result is used here.
            If bounds.IntersectRect(bounds, pos) Then
                _srch_str &= current.GetTextString()
            End If
        End If
        ' Every other element type (including e_text_new_line) is ignored.

        current = reader.Next()
    Loop
End Sub
' Collects the text found inside a selection rectangle on a page.
' The rectangle coordinates are expressed in PDF user/page coordinate system.
'
' page   - the page to scan.
' pos    - the selection rectangle.
' reader - the ElementReader used for the scan; Begin/End are called on it here.
' Returns the concatenated strings gathered by RectTextSearch
' (an empty string when nothing matched).
Function ReadTextFromRect(ByRef page As Page, ByRef pos As Rect, ByRef reader As ElementReader) As String
    ' Start from a clean accumulator; RectTextSearch appends to this module field.
    _srch_str = String.Empty

    reader.Begin(page)
    RectTextSearch(reader, pos)
    reader.End()

    Dim collected As String = _srch_str
    Return collected
End Function
End Module