Some test text!

Search
Hamburger Icon

Read a PDF file in VB (parse & extract text)

More languages

More languages
JavaScript
Java (Android)
C++
C#
C# (.NET Core)
Go
Java
Kotlin
Obj-C
JS (Node.js)
PHP
Python
Ruby
Swift
C# (UWP)
VB
C# (Xamarin)

Sample VB code for using the PDFTron SDK to read a PDF (parse and extract text). If you'd like to search for text on PDF pages, see our code sample for text search. Learn more about our VB PDF Library and PDF Parsing & Content Extraction Library.

Get Started Samples Download

To run this sample, get started with a free trial of Apryse SDK.

'
' Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
'

Imports System
Imports System.Drawing
Imports pdftron
Imports pdftron.Common
Imports pdftron.Filters
Imports pdftron.SDF
Imports PDFTRON.PDF

' This sample illustrates various text extraction capabilities of PDFNet.

Module TextExtractTestVB
	' Reference to the PDFNetLoader singleton, obtained once when the module
	' is first used. NOTE(review): presumably this makes sure the native
	' PDFNet library is loaded before any other call - confirm in the SDK docs.
	Private pdfNetLoader As PDFNetLoader

	' Module initializer: fetches the PDFNetLoader instance and keeps it
	' in the module-level field above.
	Sub New()
		pdfNetLoader = pdftron.PDFNetLoader.Instance()
	End Sub

	' Entry point of the sample.
	'
	' Initializes PDFNet, opens "newsletter.pdf" from the test-file folder and
	' runs whichever text extraction examples are switched on by the
	' example*_... flags below. PDFNet.Terminate() is called on every exit
	' path, including the early return taken when page 1 cannot be found.
	' Errors are reported on the console; no dialog is shown, so the sample
	' can run unattended.
	Sub Main()

		PDFNet.Initialize(PDFTronLicense.Key)

		Try
			' Relative path to the folder containing test files.
			Dim input_path As String = "../../../../TestFiles/"

			' Switches selecting which examples are executed.
			Dim example1_basic As Boolean = False
			Dim example2_xml As Boolean = False
			Dim example3_wordlist As Boolean = False
			Dim example4_advanced As Boolean = True
			Dim example5_low_level As Boolean = False

			' Sample code showing how to use high-level text extraction APIs.
			Try
				Using doc As PDFDoc = New PDFDoc(input_path + "newsletter.pdf")
					doc.InitSecurityHandler()

					Dim pg As Page = doc.GetPage(1)
					If pg Is Nothing Then
						Console.WriteLine("Page not found.")
						' The enclosing Finally still terminates PDFNet.
						Return
					End If

					Using txt As TextExtractor = New TextExtractor
						txt.Begin(pg)	 ' Read the page.
						' Other options you may want to consider...
						' txt.Begin(page, Nothing, TextExtractor.ProcessingFlags.e_no_dup_remove)
						' txt.Begin(page, Nothing, TextExtractor.ProcessingFlags.e_remove_hidden_text)
						' ...

						' Example 1. Get all text on the page in a single string.
						' Words will be separated with space or new line characters.
						If example1_basic Then
							' Get the word count.
							Console.WriteLine("Word Count: {0}", txt.GetWordCount())

							Console.WriteLine("")
							Console.WriteLine("- GetAsText --------------------------")
							Console.WriteLine(txt.GetAsText())
							Console.WriteLine("-----------------------------------------------------------")
						End If

						' Example 2. Get XML logical structure for the page.
						If example2_xml Then
							Console.WriteLine("")
							Console.WriteLine("- GetAsXML  --------------------------")
							Console.WriteLine(txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements Or TextExtractor.XMLOutputFlags.e_output_bbox Or TextExtractor.XMLOutputFlags.e_output_style_info))
							Console.WriteLine("-----------------------------------------------------------")
						End If

						' Example 3. Print every word on the page, one per line.
						If example3_wordlist Then
							Dim line As TextExtractor.Line = txt.GetFirstLine()
							While line.IsValid()
								Dim word As TextExtractor.Word = line.GetFirstWord()
								While word.IsValid()
									Console.WriteLine(word.GetString())
									word = word.GetNextWord()
								End While
								line = line.GetNextLine()
							End While
							Console.WriteLine("-----------------------------------------------------------")
						End If

						' Example 4. A more advanced text extraction example.
						' The output is XML structure containing paragraphs, lines, words,
						' as well as style and positioning information.
						If example4_advanced Then
							WriteStructuredText(txt)
						End If
					End Using
				End Using
			Catch ex As Exception
				' Covers PDFNetException as well as any other failure. Reported on
				' the console rather than through MsgBox so that a console run is
				' never blocked by a modal dialog.
				Console.WriteLine(ex.Message)
			End Try

			' Sample code showing how to use low-level text extraction APIs.
			If example5_low_level Then
				RunLowLevelExtraction(input_path)
			End If
		Finally
			PDFNet.Terminate()
		End Try
	End Sub

	' Writes the lines and words held by txt as a <PDFText> document made of
	' nested <Flow>, <Para>, <Line> and <Word> elements, including bounding
	' boxes and style attributes. txt must already have been started on a page.
	Private Sub WriteStructuredText(ByVal txt As TextExtractor)
		' -1 means "no flow / paragraph element is currently open".
		Dim cur_flow_id As Integer = -1
		Dim cur_para_id As Integer = -1

		Console.WriteLine("<PDFText>")

		' For each line on the page...
		Dim line As TextExtractor.Line = txt.GetFirstLine()
		While line.IsValid()
			' A change of flow closes the open paragraph and flow (if any)
			' before the new flow element is opened.
			If cur_flow_id <> line.GetFlowID() Then
				If cur_flow_id <> -1 Then
					If cur_para_id <> -1 Then
						cur_para_id = -1
						Console.WriteLine("</Para>")
					End If
					Console.WriteLine("</Flow>")
				End If
				cur_flow_id = line.GetFlowID()
				Console.WriteLine("<Flow id=""{0}"">", cur_flow_id)
			End If

			If cur_para_id <> line.GetParagraphID() Then
				If cur_para_id <> -1 Then
					Console.WriteLine("</Para>")
				End If
				cur_para_id = line.GetParagraphID()
				Console.WriteLine("<Para id=""{0}"">", cur_para_id)
			End If

			Dim bbox As Rect = line.GetBBox()
			Dim line_style As TextExtractor.Style = line.GetStyle()
			Console.Write("<Line box=""{0}, {1}, {2}, {3}""", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"))
			PrintStyle(line_style)
			Console.Write(" cur_num=""" & line.GetCurrentNum() & """")
			Console.WriteLine(">")

			' For each word in the line...
			Dim word As TextExtractor.Word = line.GetFirstWord()
			While word.IsValid()
				' Empty words are skipped before anything is written, so no
				' half-open <Word tag is left behind, and the iterator is always
				' advanced below (a bare "Continue While" here would spin forever
				' on the same word).
				If word.GetStringLen() > 0 Then
					' Output the bounding box for the word.
					bbox = word.GetBBox()
					Console.Write("<Word box=""{0}, {1}, {2}, {3}""", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"))
					Console.Write(" cur_num=""" & word.GetCurrentNum() & """")

					' If the word style is different from the parent style, output the new style.
					Dim s As TextExtractor.Style = word.GetStyle()
					If Not s.Equals(line_style) Then
						PrintStyle(s)
					End If

					Console.Write(">")
					Console.Write(word.GetString())
					Console.WriteLine("</Word>")
				End If
				word = word.GetNextWord()
			End While

			Console.WriteLine("</Line>")
			line = line.GetNextLine()
		End While

		' Close whatever is still open after the last line.
		If cur_flow_id <> -1 Then
			If cur_para_id <> -1 Then
				cur_para_id = -1
				Console.WriteLine("</Para>")
			End If
			Console.WriteLine("</Flow>")
		End If

		' The closing tag is written here, next to its opening tag, so it is
		' only emitted when the structured dump actually ran.
		Console.WriteLine("</PDFText>")
	End Sub

	' Runs the low-level (ElementReader based) examples on "newsletter.pdf"
	' located in input_path: dumps the text of the first page, then reads the
	' text inside three fixed rectangles on page 1. Errors are reported on the
	' console.
	Private Sub RunLowLevelExtraction(ByVal input_path As String)
		Try
			' Open the test file
			Using doc As PDFDoc = New PDFDoc(input_path + "newsletter.pdf")
				doc.InitSecurityHandler()

				Using reader As ElementReader = New ElementReader

					' Example 1. Extract all text content from the first page.
					' To process the whole document, repeat the three calls below in a
					' "While itr.HasNext() ... itr.Next() ... End While" loop.
					Dim itr As PageIterator = doc.GetPageIterator()
					reader.Begin(itr.Current())
					DumpAllText(reader)
					reader.End()

					' Example 2. Extract text based on the selection rectangle.
					Console.WriteLine("----------------------------------------------------")
					Console.WriteLine("Extract text based on the selection rectangle.")
					Console.WriteLine("----------------------------------------------------")

					Dim first_page As Page = doc.GetPage(1)
					Dim field1 As String = ReadTextFromRect(first_page, New Rect(27, 392, 563, 534), reader)
					Dim field2 As String = ReadTextFromRect(first_page, New Rect(28, 551, 106, 623), reader)
					Dim field3 As String = ReadTextFromRect(first_page, New Rect(208, 550, 387, 621), reader)

					Console.WriteLine("Field 1: {0}", field1)
					Console.WriteLine("Field 2: {0}", field2)
					Console.WriteLine("Field 3: {0}", field3)
					' ...

					Console.WriteLine("Done.")
				End Using
			End Using
		Catch ex As Exception
			' Same reporting policy as the high-level examples: console only.
			Console.WriteLine(ex.Message)
		End Try
	End Sub


	' Writes a style="..." attribute describing s to the console (no trailing
	' newline). The attribute lists the font name, the font size, an optional
	' " sans-serif;" marker and the color as six hexadecimal digits.
	'
	' s: the text style to describe. It is only read, never reassigned.
	Sub PrintStyle(ByRef s As TextExtractor.Style)
		Dim font_color As Color = s.GetColor()
		Dim rgb_hex As String = String.Format("{0:X02}{1:X02}{2:X02};", font_color.R, font_color.G, font_color.B)

		' The marker belongs on fonts that do NOT have serifs. The previous
		' code tested IsSerif() directly and therefore tagged serif fonts as
		' sans-serif.
		Dim sans_serif_str As String = ""
		If Not s.IsSerif() Then
			sans_serif_str = " sans-serif;"
		End If

		Console.Write(" style=""font-family:{0}; font-size:{1};{2} color:#{3}""", s.GetFontName(), s.GetFontSize(), sans_serif_str, rgb_hex)
	End Sub

	' LowLevelTextExtractUtils ----------------------------------------

	' Reads elements from reader until it returns Nothing and prints the text
	' related ones to the console: a marker line at the start and end of each
	' text block, and the string of every text element. Form XObjects are
	' entered with FormBegin() and processed recursively.
	'
	' reader: an ElementReader on which Begin() has already been called.
	Sub DumpAllText(ByRef reader As ElementReader)
		Dim current As Element = reader.Next()
		Do Until current Is Nothing		 ' Read page contents
			Dim kind As Element.Type = current.GetType()

			Select Case kind
				Case Element.Type.e_text_begin
					Console.WriteLine()
					Console.WriteLine("--> Text Block Begin")

				Case Element.Type.e_text_end
					Console.WriteLine()
					Console.WriteLine("--> Text Block End")

				Case Element.Type.e_text
					' The bounding box is fetched but only used by the optional
					' debug output below.
					Dim box As Rect = New Rect
					current.GetBBox(box)
					' Console.WriteLine("\n--> BBox: {0}, {1}, {2}, {3}", box.x1, box.y1, box.x2, box.y2)

					Console.WriteLine(current.GetTextString())

				Case Element.Type.e_text_new_line
					' Nothing is printed for new-line elements.
					' Console.WriteLine()
					' Console.WriteLine("--> New Line")

				Case Element.Type.e_form
					reader.FormBegin()				' Process form XObjects
					DumpAllText(reader)
					reader.End()
			End Select

			current = reader.Next()
		Loop
	End Sub

	' Text collected so far by RectTextSearch; reset by ReadTextFromRect
	' before each search.
	Private _srch_str As String

	' A helper method for ReadTextFromRect.
	' Reads elements from reader until it returns Nothing. For every text
	' element whose bounding box passes the IntersectRect test against pos,
	' the element's string is appended to _srch_str. Form XObjects are entered
	' with FormBegin() and searched recursively.
	Sub RectTextSearch(ByRef reader As ElementReader, ByRef pos As Rect)
		Dim current As Element = reader.Next()
		Do Until current Is Nothing		 ' Read page contents
			Dim kind As Element.Type = current.GetType()

			If kind = Element.Type.e_form Then
				reader.FormBegin()				   ' Process form XObjects
				RectTextSearch(reader, pos)
				reader.End()
			ElseIf kind = Element.Type.e_text Then
				Dim box As Rect = New Rect
				current.GetBBox(box)

				' Keep the text only when IntersectRect reports True for the
				' element box and the selection rectangle.
				If box.IntersectRect(box, pos) Then
					_srch_str &= current.GetTextString()
				End If
			End If
			' All other element kinds (including new-line markers) are ignored.

			current = reader.Next()
		Loop
	End Sub


	' Extracts the text content found inside a selection rectangle on a page.
	' The rectangle coordinates are expressed in PDF user/page coordinate
	' system.
	'
	' page:   the page to read.
	' pos:    the selection rectangle.
	' reader: the ElementReader used to walk the page; Begin() and End() are
	'         called on it here.
	' Returns the concatenated strings gathered by RectTextSearch ("" when
	' nothing matched).
	Function ReadTextFromRect(ByRef page As Page, ByRef pos As Rect, ByRef reader As ElementReader) As String
		' Start from an empty accumulator; RectTextSearch appends to it.
		_srch_str = String.Empty

		reader.Begin(page)
		RectTextSearch(reader, pos)
		reader.End()

		Dim collected As String = _srch_str
		Return collected
	End Function

End Module