Some test text!

Discord Logo

Chat with us

PDFTron is now Apryse. Learn more here.

PDF data extraction in VB (images, text, paths)

More languages

More languages
Java (Android)
C++
C#
C# (.NET Core)
Go
Java
Kotlin
Obj-C
JS (Node.js)
PHP
Python
Ruby
Swift
C# (UWP)
VB
C# (Xamarin)

Sample VB code for using PDFTron SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Learn more about our VB PDF Library and PDF Parsing & Content Extraction Library.

Get Started Samples Download

To run this sample, get started with a free trial of Apryse SDK.

'
' Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
'
' A sample project illustrating some extraction capabilities of ElementReader
' in more detail
'

Imports System

Imports pdftron
Imports pdftron.Common
Imports pdftron.Filters
Imports pdftron.SDF
Imports pdftron.PDF

Module ElementReaderAdvTestVB
    ' Module-level reference to the PDFNetLoader singleton; assigned in Sub New below.
    ' NOTE(review): presumably held so the PDFNet native library is loaded before
    ' any SDK call is made -- confirm against the SDK documentation.
    Dim pdfNetLoader As PDFNetLoader
    ' Module constructor: fetches the PDFNetLoader instance and stores it in the field above.
    ' The type is written fully qualified (pdftron.PDFNetLoader); likely because VB is
    ' case-insensitive and the field name differs from the type name only by case.
    Sub New()
        pdfNetLoader = pdftron.PDFNetLoader.Instance()
    End Sub

    ' Holds the most recently formatted path-segment string; ProcessPath assigns it
    ' immediately before writing it to the console.
    Dim m_buf As String

    ' Prints a description of one path element to the console:
    '   * whether it is a clipping path,
    '   * its segments as a quoted list of M/L/C commands (rectangles are
    '     expanded into their four corners),
    '   * whether it is stroked and/or filled and whether a pattern is involved.
    ' It finally walks the graphics-state changes reported by the reader; the
    ' branches there only show (as comments) which GState getter matches each
    ' attribute.
    '
    ' reader - the element reader that produced 'path'; used for GetChangesIterator().
    ' path   - the path element to describe.
    Sub ProcessPath(ByRef reader As ElementReader, ByRef path As Element)
        If path.IsClippingPath() Then
            Console.WriteLine("This is a clipping path")
        End If

        ' The local is deliberately not called "pathData": VB is case-insensitive,
        ' so that name would hide the PathData type used in the Select Case below.
        Dim path_data As PathData = path.GetPathData()
        Dim points As Double() = path_data.points
        Dim operators As Byte() = path_data.operators

        Dim pt As Integer = 0      ' index of the next unread value in points
        Dim x1, y1, x2, y2, x3, y3 As Double

        ' Use path.GetCTM() if you are interested in CTM (current transformation matrix).

        ' Open the quoted point list. VB has no backslash escapes: a double quote
        ' inside a string literal is written as two double quotes. The matching
        ' closing quote is written after the loop.
        Console.Write(" Path Data Points := """)
        For Each op As Byte In operators
            Select Case CType(op, PathData.PathSegmentType)
                Case PathData.PathSegmentType.e_moveto
                    x1 = points(pt)
                    y1 = points(pt + 1)
                    pt += 2
                    m_buf = String.Format("M{0:g5} {1:g5}", x1, y1)
                    Console.Write(m_buf)

                Case PathData.PathSegmentType.e_lineto
                    x1 = points(pt)
                    y1 = points(pt + 1)
                    pt += 2
                    m_buf = String.Format(" L{0:g5} {1:g5}", x1, y1)
                    Console.Write(m_buf)

                Case PathData.PathSegmentType.e_cubicto
                    ' Six values are consumed for a cubic segment.
                    x1 = points(pt)
                    y1 = points(pt + 1)
                    x2 = points(pt + 2)
                    y2 = points(pt + 3)
                    x3 = points(pt + 4)
                    y3 = points(pt + 5)
                    pt += 6
                    Dim cubic() As Object = New Object() {x1, y1, x2, y2, x3, y3}
                    m_buf = String.Format(" C{0:g5} {1:g5} {2:g5} {3:g5} {4:g5} {5:g5}", _
                     cubic)
                    Console.Write(m_buf)

                Case PathData.PathSegmentType.e_rect
                    ' A rectangle is stored as x, y, width, height and is printed
                    ' as a closed four-corner polygon.
                    x1 = points(pt)
                    y1 = points(pt + 1)
                    Dim w As Double = points(pt + 2)
                    Dim h As Double = points(pt + 3)
                    pt += 4
                    x2 = x1 + w
                    y2 = y1
                    x3 = x2
                    y3 = y1 + h
                    Dim x4 As Double = x1
                    Dim y4 As Double = y3
                    Dim corners() As Object = New Object() {x1, y1, x2, y2, x3, y3, x4, y4}
                    m_buf = String.Format("M{0:g5} {1:g5} L{2:g5} {3:g5} L{4:g5} {5:g5} L{6:g5} {7:g5} Z", _
                     corners)
                    Console.Write(m_buf)

                Case PathData.PathSegmentType.e_closepath
                    Console.WriteLine(" Close Path")

                Case Else
                    ' Any other operator is not handled by this sample.
                    System.Diagnostics.Debug.Assert(False)
            End Select
        Next

        ' Close the quoted point list.
        Console.Write(""" ")

        Dim gs As GState = path.GetGState()

        ' Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
        If path.IsStroked() Then
            Console.WriteLine("Stroke path")
            If gs.GetStrokeColorSpace().GetType() = ColorSpace.Type.e_pattern Then
                Console.WriteLine("Path has associated pattern")
            Else
                ' Get stroke color (you can use PDFNet color conversion facilities)
                ' Dim rgb As ColorPt
                ' gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb)
            End If
        End If

        If path.IsFilled() Then
            Console.WriteLine("Fill path")

            If gs.GetFillColorSpace().GetType() = ColorSpace.Type.e_pattern Then
                Console.WriteLine("Path has associated pattern")
            Else
                ' Dim rgb As ColorPt
                ' gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb)
            End If
        End If

        ' Process any changes in graphics state  ---------------------------------
        ' Each case only names, as a comment, the getter that corresponds to the
        ' changed attribute; nothing is printed for these.
        Dim gs_itr As GSChangesIterator = reader.GetChangesIterator()
        While gs_itr.HasNext()
            Select Case gs_itr.Current()
                Case GState.GStateAttribute.e_transform
                    ' Get transform matrix for this element. Unlike path.GetCTM()
                    ' that returns the full transformation matrix, gs.GetTransform()
                    ' returns only the transformation matrix that was installed for
                    ' this element.
                    '
                    ' gs.GetTransform()
                Case GState.GStateAttribute.e_line_width
                    ' gs.GetLineWidth()
                Case GState.GStateAttribute.e_line_cap
                    ' gs.GetLineCap()
                Case GState.GStateAttribute.e_line_join
                    ' gs.GetLineJoin()
                Case GState.GStateAttribute.e_flatness
                    ' gs.GetFlatness()
                Case GState.GStateAttribute.e_miter_limit
                    ' gs.GetMiterLimit()
                Case GState.GStateAttribute.e_dash_pattern
                    ' Dim dashes As Double()
                    ' gs.GetDashes(dashes)
                    ' gs.GetPhase()
            End Select

            gs_itr.Next()
        End While
    End Sub

    ' Consumes elements from page_reader until the end of the current text block
    ' (e_text_end) or until the reader is exhausted. For every text run it prints
    ' the font name, the effective font size, the fill colour converted to RGB,
    ' and each character together with its transformed position.
    '
    ' page_reader - reader positioned just after an e_text_begin element.
    Sub ProcessText(ByRef page_reader As ElementReader)
        ' Begin text element
        Console.WriteLine("Begin Text Block:")

        ' Named "elem" rather than "element": VB is case-insensitive, so the
        ' latter would hide the Element type used in the comparisons below.
        Dim elem As Element = page_reader.Next()
        While elem IsNot Nothing
            If elem.GetType() = Element.Type.e_text_end Then
                ' Finish the text block
                Console.WriteLine("End Text Block.")
                Return
            ElseIf elem.GetType() = Element.Type.e_text Then
                Dim gs As GState = elem.GetGState()

                ' The stroke colour is available in the same way as the fill colour:
                ' gs.GetStrokeColorSpace()
                ' gs.GetStrokeColor()

                Dim text_font As Font = gs.GetFont()

                Console.Write("Font Name: ")
                Console.Write(text_font.GetName())
                ' text_font.IsFixedWidth()
                ' text_font.IsSerif()
                ' text_font.IsSymbolic()
                ' text_font.IsItalic()
                ' ...

                ' Dim word_spacing As Double = gs.GetWordSpacing()
                ' Dim char_spacing As Double = gs.GetCharSpacing()

                ' Use elem.GetCTM() if you are interested in the CTM
                ' (current transformation matrix).
                Dim ctm As Matrix2D = elem.GetCTM()
                Dim text_mtx As Matrix2D = elem.GetTextMatrix()

                ' CTM concatenated with the text matrix. It is the same for every
                ' character of this run, so it is built once and reused in the
                ' character loop below.
                Dim mtx As Matrix2D = New Matrix2D
                mtx.Set(ctm)
                mtx.Concat(text_mtx.m_a, text_mtx.m_b, text_mtx.m_c, text_mtx.m_d, text_mtx.m_h, text_mtx.m_v)

                ' The nominal font size is scaled by the vertical scale of the combined matrix.
                Dim font_sz_scale_factor As Double = System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d)
                Dim font_size As Double = gs.GetFontSize()
                Console.Write(" Font Size: {0:f}", font_sz_scale_factor * font_size)

                ' Convert the fill colour to RGB, whatever the fill colour space is.
                Dim font_color As ColorPt = gs.GetFillColor()
                Dim cs As ColorSpace = gs.GetFillColorSpace()

                Dim rgb As ColorPt = New ColorPt
                cs.Convert2RGB(font_color, rgb)

                Console.WriteLine(" Font Color(RGB): red={0:d} green={1:d} blue={2:d}", _
                    CByte(rgb.Get(0) * 255), CByte(rgb.Get(1) * 255), CByte(rgb.Get(2) * 255))

                Dim x, y As Double
                Dim char_code As Integer

                Dim itr As CharIterator = elem.GetCharIterator()
                While itr.HasNext()
                    Console.Write("Character code: ")
                    char_code = itr.Current().char_code
                    ' Chr() raises ArgumentException for codes above 255, which
                    ' multi-byte fonts can produce. ChrW() on the low 16 bits never
                    ' throws and matches a plain 16-bit character conversion.
                    Console.Write(ChrW(char_code And 65535))

                    x = itr.Current().x      ' character positioning information
                    y = itr.Current().y

                    ' To get the exact character position, the relative coordinates
                    ' are multiplied by the combined CTM / text matrix built above.
                    ' Mult takes x and y by reference and updates them in place.
                    mtx.Mult(x, y)
                    Console.WriteLine(" Position: x={0:f} y={1:f}", x, y)
                    itr.Next()
                End While

                Console.WriteLine()
            End If
            elem = page_reader.Next()
        End While
    End Sub

    ' Prints the pixel dimensions of an image element and reads its pixels,
    ' converted to RGB through the Image2RGB filter, into a local buffer.
    ' The buffer is not used further; the method only demonstrates the calls.
    '
    ' image - the image element to process.
    Sub ProcessImage(ByRef image As Element)
        ' Queried for demonstration only; the values are not used below.
        Dim image_mask As Boolean = image.IsImageMask()
        Dim interpolate As Boolean = image.IsImageInterpolate()

        Dim width As Integer = image.GetImageWidth()
        Dim height As Integer = image.GetImageHeight()
        ' Three bytes (R, G, B) per pixel in the converted output.
        Dim out_data_sz As Integer = width * height * 3

        Console.WriteLine("Image: width=""{0:d}"" height=""{1:d}""", width, height)

        ' Dim mtx As Matrix2D = image.GetCTM() ' image matrix (page positioning info)

        ' You can use GetImageData to read the raw (decoded) image data
        'image.GetBitsPerComponent()
        'image.GetImageData()    ' get raw image data
        ' .... or use Image2RGB filter that converts every image to RGB format,
        ' This should save you time since you don't need to deal with color conversions,
        ' image up-sampling, decoding etc.

        Dim img_conv As Image2RGB = New Image2RGB(image)       ' Extract and convert image to RGB 8-bpc format
        Dim reader As FilterReader = New FilterReader(img_conv)

        ' A buffer used to keep image data. It has to be allocated before the
        ' read: passing Nothing gives the reader nowhere to store the pixels.
        ' VB array bounds are inclusive upper bounds, hence "size - 1"
        ' (a size of 0 yields an empty array).
        Dim image_data_out As Byte() = New Byte(out_data_sz - 1) {}

        reader.Read(image_data_out)
        ' image_data_out contains RGB image data.

        ' Note that you don't need to read a whole image at a time. Alternatively
        ' you can read a chunk at a time by repeatedly calling reader.Read(buf, buf_sz)
        ' until the function returns 0.
    End Sub

    ' Reads every element from the reader and dispatches it by type:
    ' paths to ProcessPath, text blocks to ProcessText, images to ProcessImage.
    ' Form XObjects are entered with FormBegin() and processed recursively.
    ' Elements of any other type are skipped.
    '
    ' reader - a reader on which Begin() or FormBegin() has already been called.
    Sub ProcessElements(ByRef reader As ElementReader)
        ' Fetch the first element exactly once. Calling Next() a second time
        ' before the loop would discard the first element of every page and
        ' of every form XObject.
        Dim elem As Element = reader.Next()

        While elem IsNot Nothing         ' Read page contents
            Select Case elem.GetType()
                Case Element.Type.e_path
                    ' Process path data...
                    ProcessPath(reader, elem)
                Case Element.Type.e_text_begin
                    ' Process text strings...
                    ProcessText(reader)
                Case Element.Type.e_form
                    ' Process form XObjects
                    reader.FormBegin()
                    ProcessElements(reader)
                    reader.End()
                Case Element.Type.e_image
                    ' Process Images
                    ProcessImage(elem)
            End Select
            elem = reader.Next()
        End While
    End Sub

    ' Entry point: initializes PDFNet, opens TestFiles/newsletter.pdf and, for
    ' every page, prints the crop box and then the page's elements through
    ' ProcessElements. PDFNet.Terminate() is called whether or not processing
    ' succeeds; a PDFNetException is reported on the console instead of
    ' aborting the program.
    Sub Main()

        PDFNet.Initialize(PDFTronLicense.Key)

        ' Relative path to the folder containing test files.
        Dim input_path As String = "../../../../TestFiles/"
        ' Dim output_path As String = "../../../../TestFiles/Output/"

        Console.WriteLine("-------------------------------------------------")
        Console.WriteLine("Extract page element information from all")
        Console.WriteLine("pages in the document.")

        Try
            ' Open the test file
            Console.WriteLine("Opening the input file...")
            Using doc As PDFDoc = New PDFDoc(input_path & "newsletter.pdf")
                doc.InitSecurityHandler()

                Using page_reader As ElementReader = New ElementReader
                    Dim itr As PageIterator = doc.GetPageIterator()
                    While itr.HasNext()    '  Read every page
                        Console.WriteLine("Page {0:d} ----------------------------------------", _
                         itr.GetPageNumber())

                        Dim crop_box As Rect = itr.Current().GetCropBox()
                        Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2)
                        Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height())

                        page_reader.Begin(itr.Current())
                        ProcessElements(page_reader)
                        page_reader.End()
                        itr.Next()
                    End While
                End Using
            End Using
        Catch ex As PDFNetException
            ' Report SDK failures (missing file, damaged document, ...) rather
            ' than letting them end the program with an unhandled exception.
            Console.WriteLine(ex.Message)
        Finally
            ' Runs after both Using blocks have disposed the reader and the document.
            PDFNet.Terminate()
        End Try
        Console.WriteLine("Done.")

    End Sub

End Module