More languages
Some test text!
More languages
Sample VB code for using PDFTron SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Learn more about our VB PDF Library and PDF Parsing & Content Extraction Library.
Get Started | Samples | Download. To run this sample, get started with a free trial of Apryse SDK.
'
' Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
'
' A sample project illustrating some extraction capabilities of ElementReader
' in more detail
'
Imports System
Imports pdftron
Imports pdftron.Common
Imports pdftron.Filters
Imports pdftron.SDF
Imports pdftron.PDF
Module ElementReaderAdvTestVB
' Holds the object returned by pdftron.PDFNetLoader.Instance(); assigned in the
' module constructor below.
Private pdfNetLoader As PDFNetLoader

' Module constructor: stores the PDFNetLoader instance in the module-level field.
Sub New()
    pdfNetLoader = pdftron.PDFNetLoader.Instance()
End Sub

' Scratch string; ProcessPath stores each formatted path segment here before printing it.
Private m_buf As String
' Prints a description of a path element to the console.
'
' Parameters:
'   reader - the ElementReader the element came from; used only to enumerate
'            the graphics-state attributes that changed for this element.
'   path   - the path element to describe.
'
' Output: an optional clipping-path note, the path segments in an SVG-like
' notation enclosed in double quotes, and whether the path is stroked/filled
' (including whether the stroke/fill color space is a pattern).
Sub ProcessPath(ByRef reader As ElementReader, ByRef path As Element)
    If path.IsClippingPath() Then
        Console.WriteLine("This is a clipping path")
    End If

    Dim path_data As PathData = path.GetPathData()
    Dim points As Double() = path_data.points
    Dim ops As Byte() = path_data.operators
    Dim pt As Integer = 0 ' index of the next unread coordinate in points

    Dim x1, y1, x2, y2, x3, y3 As Double

    ' Use path.GetCTM() if you are interested in CTM (current transformation matrix).

    ' VB escapes a double quote by doubling it; a backslash is a literal
    ' character. The opening quote written here is matched by the closing
    ' quote written after the loop.
    Console.Write(" Path Data Points := """)

    For Each op As Byte In ops
        Select Case CType(op, PathData.PathSegmentType)
            Case PathData.PathSegmentType.e_moveto
                x1 = points(pt)
                y1 = points(pt + 1)
                pt += 2
                m_buf = String.Format("M{0:g5} {1:g5}", x1, y1)
                Console.Write(m_buf)

            Case PathData.PathSegmentType.e_lineto
                x1 = points(pt)
                y1 = points(pt + 1)
                pt += 2
                m_buf = String.Format(" L{0:g5} {1:g5}", x1, y1)
                Console.Write(m_buf)

            Case PathData.PathSegmentType.e_cubicto
                ' Three coordinate pairs are consumed for a cubic segment.
                x1 = points(pt)
                y1 = points(pt + 1)
                x2 = points(pt + 2)
                y2 = points(pt + 3)
                x3 = points(pt + 4)
                y3 = points(pt + 5)
                pt += 6
                m_buf = String.Format(" C{0:g5} {1:g5} {2:g5} {3:g5} {4:g5} {5:g5}", _
                                      x1, y1, x2, y2, x3, y3)
                Console.Write(m_buf)

            Case PathData.PathSegmentType.e_rect
                ' A rectangle is stored as x, y, width, height; it is printed
                ' as the equivalent closed four-corner outline.
                x1 = points(pt)
                y1 = points(pt + 1)
                Dim w As Double = points(pt + 2)
                Dim h As Double = points(pt + 3)
                pt += 4
                x2 = x1 + w
                y2 = y1
                x3 = x2
                y3 = y1 + h
                Dim x4 As Double = x1
                Dim y4 As Double = y3
                m_buf = String.Format("M{0:g5} {1:g5} L{2:g5} {3:g5} L{4:g5} {5:g5} L{6:g5} {7:g5} Z", _
                                      x1, y1, x2, y2, x3, y3, x4, y4)
                Console.Write(m_buf)

            Case PathData.PathSegmentType.e_closepath
                Console.WriteLine(" Close Path")

            Case Else
                ' Unexpected segment type; no coordinates are consumed for it.
                System.Diagnostics.Debug.Assert(False)
        End Select
    Next

    Console.Write(""" ")

    Dim gs As GState = path.GetGState()

    ' Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    If path.IsStroked() Then
        Console.WriteLine("Stroke path")
        If gs.GetStrokeColorSpace().GetType() = ColorSpace.Type.e_pattern Then
            Console.WriteLine("Path has associated pattern")
        Else
            ' Get stroke color (you can use PDFNet color conversion facilities)
            ' Dim rgb As ColorPt
            ' gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb)
        End If
    Else
        ' Do not stroke path
    End If

    If path.IsFilled() Then
        Console.WriteLine("Fill path")
        If gs.GetFillColorSpace().GetType() = ColorSpace.Type.e_pattern Then
            Console.WriteLine("Path has associated pattern")
        Else
            ' Dim rgb As ColorPt
            ' gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb)
        End If
    Else
        ' Do not fill path
    End If

    ' Process any changes in graphics state ---------------------------------
    ' The branches below are placeholders showing which GState getter
    ' corresponds to each changed attribute; none of them produces output.
    Dim gs_itr As GSChangesIterator = reader.GetChangesIterator()
    While gs_itr.HasNext()
        Select Case gs_itr.Current()
            Case GState.GStateAttribute.e_transform
                ' Get transform matrix for this element. Unlike path.GetCTM(),
                ' which returns the full transformation matrix, gs.GetTransform()
                ' returns only the transformation matrix that was installed for
                ' this element.
                '
                ' gs.GetTransform()
            Case GState.GStateAttribute.e_line_width
                ' gs.GetLineWidth()
            Case GState.GStateAttribute.e_line_cap
                ' gs.GetLineCap()
            Case GState.GStateAttribute.e_line_join
                ' gs.GetLineJoin()
            Case GState.GStateAttribute.e_flatness
            Case GState.GStateAttribute.e_miter_limit
                ' gs.GetMiterLimit()
            Case GState.GStateAttribute.e_dash_pattern
                ' Dim dashes As Double()
                ' gs.GetDashes(dashes)
                ' gs.GetPhase()
        End Select
        gs_itr.Next()
    End While
End Sub
' Prints every text run of the current text block to the console.
'
' Parameters:
'   page_reader - reader positioned just after an e_text_begin element.
'
' Elements are consumed from page_reader until an e_text_end element is found
' (the Sub then returns) or the reader is exhausted. For each e_text element
' the font name, effective font size, fill color converted to RGB, and every
' character code with its transformed position are written out.
Sub ProcessText(ByRef page_reader As ElementReader)
    ' Begin text element
    Console.WriteLine("Begin Text Block:")

    Dim elem As Element = page_reader.Next()
    While elem IsNot Nothing
        If elem.GetType() = Element.Type.e_text_end Then
            ' Finish the text block
            Console.WriteLine("End Text Block.")
            Return
        ElseIf elem.GetType() = Element.Type.e_text Then
            Dim gs As GState = elem.GetGState()

            Dim font As Font = gs.GetFont()
            Console.Write("Font Name: ")
            Console.Write(font.GetName())
            ' font.IsFixedWidth()
            ' font.IsSerif()
            ' font.IsSymbolic()
            ' font.IsItalic()
            ' ...
            ' Dim word_spacing As Double = gs.GetWordSpacing()
            ' Dim char_spacing As Double = gs.GetCharSpacing()

            ' Use element.GetCTM() if you are interested in the CTM
            ' (current transformation matrix).
            Dim ctm As Matrix2D = elem.GetCTM()
            Dim text_mtx As Matrix2D = elem.GetTextMatrix()

            ' CTM concatenated with the text matrix. Neither input changes
            ' while this element is processed, so the product is built once
            ' and reused for the font scale and for every character position.
            Dim mtx As Matrix2D = New Matrix2D
            mtx.Set(ctm)
            mtx.Concat(text_mtx.m_a, text_mtx.m_b, text_mtx.m_c, text_mtx.m_d, text_mtx.m_h, text_mtx.m_v)

            Dim font_sz_scale_factor As Double = System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d)
            Dim font_size As Double = gs.GetFontSize()
            Console.Write(" Font Size: {0:f}", font_sz_scale_factor * font_size)

            Dim font_color As ColorPt = gs.GetFillColor()
            Dim cs As ColorSpace = gs.GetFillColorSpace()
            Dim rgb As ColorPt = New ColorPt
            cs.Convert2RGB(font_color, rgb)
            Console.WriteLine(" Font Color(RGB): red={0:d} green={1:d} blue={2:d}", _
                              CByte(rgb.Get(0) * 255), CByte(rgb.Get(1) * 255), CByte(rgb.Get(2) * 255))

            Dim x, y As Double
            Dim char_code As Integer
            Dim itr As CharIterator = elem.GetCharIterator()
            While itr.HasNext()
                Console.Write("Character code: ")
                char_code = itr.Current().char_code
                If char_code >= 0 AndAlso char_code <= 255 Then
                    Console.Write(Chr(char_code))
                Else
                    ' Chr raises ArgumentException for codes outside 0..255 on
                    ' single-byte code pages, so larger codes are printed
                    ' numerically instead of aborting the extraction.
                    Console.Write("0x{0:X}", char_code)
                End If

                x = itr.Current().x ' character positioning information
                y = itr.Current().y

                ' To get the exact character positioning information the
                ' relative coordinates are multiplied by the combined
                ' CTM * text matrix computed above.
                mtx.Mult(x, y)
                Console.WriteLine(" Position: x={0:f} y={1:f}", x, y)
                itr.Next()
            End While
            Console.WriteLine()
        End If
        elem = page_reader.Next()
    End While
End Sub
' Prints the pixel dimensions of an image element and reads its pixels
' through an Image2RGB filter into a local buffer.
'
' Parameters:
'   image - the image element to process.
'
' The buffer is sized width * height * 3 bytes (three bytes per pixel for the
' RGB 8-bpc output of Image2RGB). The decoded data is not used further; this
' Sub only demonstrates how to obtain it.
Sub ProcessImage(ByRef image As Element)
    ' Queried for illustration only; the values are not used below.
    Dim image_mask As Boolean = image.IsImageMask()
    Dim interpolate As Boolean = image.IsImageInterpolate()

    Dim width As Integer = image.GetImageWidth()
    Dim height As Integer = image.GetImageHeight()
    Dim out_data_sz As Integer = width * height * 3
    Console.WriteLine("Image: width=""{0:d}"" height=""{1:d}""", width, height)

    ' Dim mtx As Matrix2D = image.GetCTM() ' image matrix (page positioning info)

    ' You can use GetImageData to read the raw (decoded) image data
    'image.GetBitsPerComponent()
    'image.GetImageData() ' get raw image data
    ' .... or use Image2RGB filter that converts every image to RGB format,
    ' This should save you time since you don't need to deal with color conversions,
    ' image up-sampling, decoding etc.
    Dim img_conv As Image2RGB = New Image2RGB(image) ' Extract and convert image to RGB 8-bpc format
    Dim reader As FilterReader = New FilterReader(img_conv)

    ' A buffer used to keep image data. VB array bounds are inclusive upper
    ' bounds, hence "- 1"; an upper bound of -1 yields an empty array when the
    ' image has no pixels.
    Dim image_data_out As Byte() = New Byte(out_data_sz - 1) {}
    reader.Read(image_data_out)
    ' image_data_out contains RGB image data.
    ' Note that you don't need to read a whole image at a time. Alternatively
    ' you can read a chunk at a time by repeatedly calling reader.Read(buf, buf_sz)
    ' until the function returns 0.
End Sub
' Walks every element the reader yields and dispatches it by element type.
'
' Parameters:
'   reader - an ElementReader on which Begin (or FormBegin) has already been
'            called; it is read until Next() returns Nothing.
'
' Paths go to ProcessPath, text blocks to ProcessText, images to ProcessImage.
' Form XObjects are entered with FormBegin(), processed recursively, and left
' with End(). Other element types are ignored.
Sub ProcessElements(ByRef reader As ElementReader)
    ' Next() is called exactly once before the loop so that the first element
    ' of the page (or form) is processed rather than discarded.
    Dim elem As Element = reader.Next()
    While elem IsNot Nothing ' Read page contents
        Select Case elem.GetType()
            Case Element.Type.e_path
                ' Process path data...
                ProcessPath(reader, elem)
            Case Element.Type.e_text_begin
                ' Process text strings...
                ProcessText(reader)
            Case Element.Type.e_form
                ' Process form XObjects
                reader.FormBegin()
                ProcessElements(reader)
                reader.End()
            Case Element.Type.e_image
                ' Process Images
                ProcessImage(elem)
        End Select
        elem = reader.Next()
    End While
End Sub
' Entry point: opens newsletter.pdf from the test-files folder and prints
' element information for every page.
'
' For each page the page number, crop box and page size are written, then the
' page content is traversed with ProcessElements. PDFNet.Terminate() is placed
' in a Finally block so it runs even when opening or reading the document
' throws; the exception itself is not caught and still propagates.
Sub Main()
    PDFNet.Initialize(PDFTronLicense.Key)
    Try
        ' Relative path to the folder containing test files.
        Dim input_path As String = "../../../../TestFiles/"
        ' Dim output_path As String = "../../../../TestFiles/Output/"

        Console.WriteLine("-------------------------------------------------")
        Console.WriteLine("Extract page element information from all")
        Console.WriteLine("pages in the document.")

        ' Open the test file
        Console.WriteLine("Opening the input file...")
        Using doc As New PDFDoc(input_path & "newsletter.pdf")
            doc.InitSecurityHandler()

            Using page_reader As New ElementReader
                Dim itr As PageIterator = doc.GetPageIterator()
                While itr.HasNext() ' Read every page
                    Console.WriteLine("Page {0:d} ----------------------------------------", _
                                      itr.GetPageNumber())

                    Dim current_page As Page = itr.Current()
                    Dim crop_box As Rect = current_page.GetCropBox()
                    Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2)
                    Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height())

                    page_reader.Begin(current_page)
                    ProcessElements(page_reader)
                    page_reader.End()

                    itr.Next()
                End While
            End Using
        End Using
    Finally
        PDFNet.Terminate()
    End Try
    Console.WriteLine("Done.")
End Sub
End Module