Some test text!

Discord Logo

Chat with us

PDFTron is now Apryse, learn more here.

PDF logical structure reader in VB

More languages

More languages
JavaScript
Java (Android)
C++
C#
C# (.NET Core)
Go
Java
Kotlin
Obj-C
JS (Node.js)
PHP
Python
Ruby
Swift
VB
C# (Xamarin)

Sample VB code that uses the PDFTron SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; the ContentItems are the leaf nodes of the structure tree. Learn more about our VB PDF Library and PDF Parsing & Content Extraction Library.

Get Started Samples Download

To run this sample, get started with a free trial of Apryse SDK.

'
' Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
'

Imports System
Imports System.Collections
Imports pdftron
Imports pdftron.Common
Imports pdftron.Filters
Imports pdftron.SDF
Imports pdftron.PDF
Imports pdftron.PDF.Struct

Module LogicalStructureTestCS
    ' Module-level reference to the loader instance obtained in Sub New below.
    ' Nothing else in this module reads it.
    Dim pdfNetLoader As PDFNetLoader
    ' Module constructor: fetches PDFNetLoader.Instance() and stores it in the field above.
    ' NOTE(review): presumably this makes sure the PDFNet native library is loaded
    ' before any other PDFNet call is made - confirm against the PDFNetLoader docs.
    Sub New()
        ' The type is written fully qualified (pdftron.PDFNetLoader) because VB is
        ' case-insensitive and the unqualified name would bind to the field pdfNetLoader.
        pdfNetLoader = pdftron.PDFNetLoader.Instance()
    End Sub

    ' Starts a new console line and then writes two spaces per indentation level.
    '
    ' indent - number of indentation levels; zero or a negative value produces
    '          only the line break.
    Sub PrintIndent(ByVal indent As Integer)
        Console.WriteLine()

        ' Count the requested levels down, emitting one two-space unit per level.
        Dim levelsLeft As Integer = indent

        While levelsLeft > 0
            Console.Write("  ")
            levelsLeft -= 1
        End While
    End Sub

    ' Recursively prints one structure element and everything beneath it.
    '
    ' element - the structure element to print; an invalid element prints nothing.
    ' indent  - nesting depth used for the element's own "Type:" line. Its content
    '           items and child structure elements are printed one level deeper.
    Sub ProcessStructElement(ByVal element As SElement, ByVal indent As Integer)
        If Not element.IsValid() Then
            Return
        End If

        ' The element header goes at the caller's depth.
        PrintIndent(indent)
        Console.Write("Type: " & element.[GetType]())

        If element.HasTitle() Then
            Console.Write(". Title: " & element.GetTitle())
        End If

        ' Everything owned by this element is printed one level deeper. This replaces
        ' the former Math.Min(Interlocked.Increment(indent), indent - 1) construct,
        ' which was only a roundabout way of writing "use indent, then add one".
        Dim kidIndent As Integer = indent + 1
        Dim num As Integer = element.GetNumKids()

        For i As Integer = 0 To num - 1

            If element.IsContentItem(i) Then
                Dim cont As ContentItem = element.GetAsContentItem(i)
                Dim type As ContentItem.Type = cont.[GetType]()
                Dim page As Page = cont.GetPage()
                PrintIndent(kidIndent)
                Console.Write("Content Item. Part of page #" & page.GetIndex())
                PrintIndent(kidIndent)

                Select Case type
                    Case ContentItem.Type.e_MCID, ContentItem.Type.e_MCR
                        Console.Write("MCID: " & cont.GetMCID())
                    Case ContentItem.Type.e_OBJR
                        Console.Write("OBJR ")
                        Dim ref_obj As Obj = cont.GetRefObj()

                        ' The referenced object may be missing; only report its number when present.
                        If ref_obj IsNot Nothing Then
                            Console.Write("- Referenced Object#: " & ref_obj.GetObjNum())
                        End If
                    Case Else
                        ' Any other content item kind prints no additional detail.
                End Select
            Else
                ' Not a content item, so the kid is itself a structure element: recurse.
                ProcessStructElement(element.GetAsStructElem(i), kidIndent)
            End If
        Next
    End Sub

    ' Walks every element the reader yields and, for path, text and form XObject
    ' elements, prints the element kind followed by details of the structure
    ' element that owns it (when that parent is valid).
    '
    ' reader - an ElementReader the caller has already started with Begin; this
    '          routine only calls Next() until it returns Nothing.
    Sub ProcessElements(ByVal reader As ElementReader)
        Dim element As Element = reader.Next()
        While Not IsNothing(element)  ' Read page contents
            Dim type As Element.Type = element.[GetType]()

            ' Fixed: the third comparison used to repeat e_path, which made the
            ' e_form case below unreachable and skipped form XObjects entirely.
            If type = element.Type.e_path OrElse type = element.Type.e_text OrElse type = element.Type.e_form Then

                Select Case type
                    Case element.Type.e_path
                        Console.WriteLine()
                        Console.Write("PATH: ")
                    Case element.Type.e_text
                        Console.WriteLine()
                        Console.WriteLine("TEXT: " & element.GetTextString())
                    Case element.Type.e_form
                        Console.WriteLine()
                        Console.Write("FORM XObject: ")
                End Select

                Dim struct_parent As SElement = element.GetParentStructElement()

                ' Elements without a valid parent structure element get no extra detail.
                If struct_parent.IsValid() Then
                    Console.Write(" Type: " & struct_parent.[GetType]() & ", MCID: " + String.Format("{0}", element.GetStructMCID()))

                    If struct_parent.HasTitle() Then
                        Console.Write(". Title: " & struct_parent.GetTitle())
                    End If

                    Console.Write(", Obj#: " & struct_parent.GetSDFObj().GetObjNum())
                End If
            End If
            element = reader.Next()
        End While
    End Sub

    ' Collects the text of every text element that carries a non-negative
    ' structure MCID, keyed by that MCID.
    '
    ' reader        - an ElementReader the caller has already started with Begin.
    ' mcid_page_map - table filled in place: key is the MCID (Integer), value is
    '                 the concatenation of all text strings seen for that MCID,
    '                 in the order the reader returned them.
    Sub ProcessElements2(ByVal reader As ElementReader, ByVal mcid_page_map As Hashtable)
        Dim current As Element = reader.Next()

        Do Until IsNothing(current)  ' Read page contents
            Dim contentId As Integer = current.GetStructMCID()
            Dim isTaggedText As Boolean = contentId >= 0 AndAlso current.[GetType]() = Element.Type.e_text

            If isTaggedText Then
                Dim fragment As String = current.GetTextString()

                If Not mcid_page_map.ContainsKey(contentId) Then
                    ' First fragment for this MCID.
                    mcid_page_map.Add(contentId, fragment)
                Else
                    ' Later fragments are appended to what is already stored.
                    mcid_page_map(contentId) = CStr(mcid_page_map(contentId)) & fragment
                End If
            End If

            current = reader.Next()
        Loop
    End Sub

    ' Recursively prints a structure element as an XML-like tag pair, with the
    ' collected page text of its MCID content items written between the tags.
    '
    ' element      - the structure element to print; an invalid element prints nothing.
    ' mcid_doc_map - table keyed by page index whose values are Hashtables mapping
    '                MCID to text (as built by ProcessElements2 from Main).
    ' indent       - nesting depth for the opening and closing tags; child
    '                structure elements are printed at indent + 1.
    Sub ProcessStructElement2(ByVal element As SElement, ByVal mcid_doc_map As Hashtable, ByVal indent As Integer)
        If Not element.IsValid() Then Return

        ' Opening tag, with an optional title attribute.
        PrintIndent(indent)
        Console.Write("<" & element.[GetType]())

        If element.HasTitle() Then
            Console.Write(" title=""" & element.GetTitle() & """")
        End If

        Console.Write(">")

        Dim lastKid As Integer = element.GetNumKids() - 1

        For kid As Integer = 0 To lastKid
            ' Child structure elements are handled by recursion, one level deeper.
            If Not element.IsContentItem(kid) Then
                ProcessStructElement2(element.GetAsStructElem(kid), mcid_doc_map, indent + 1)
                Continue For
            End If

            ' Only MCID content items contribute text; other kinds are skipped.
            Dim item As ContentItem = element.GetAsContentItem(kid)
            If item.[GetType]() <> ContentItem.Type.e_MCID Then Continue For

            ' Skip items whose page has no entry in the document map.
            Dim pageIndex As Integer = item.GetPage().GetIndex()
            If Not mcid_doc_map.ContainsKey(pageIndex) Then Continue For

            Dim pageTexts As Hashtable = CType(mcid_doc_map(pageIndex), Hashtable)
            Dim contentId As Integer = item.GetMCID()

            If pageTexts.ContainsKey(contentId) Then
                Console.Write(pageTexts(contentId))
            End If
        Next

        ' Closing tag at the same depth as the opening one.
        PrintIndent(indent)
        Console.Write("</" & element.[GetType]() & ">")
    End Sub


    ' Entry point. Opens tagged.pdf from the test-files folder and runs three
    ' console demonstrations against it:
    '   1. a traversal of the logical structure tree,
    '   2. a page-content walk that reports each element's parent structure element,
    '   3. an XML-style dump that combines the structure tree with page text.
    ' The document is then saved to the output folder. A PDFNetException raised
    ' along the way is reported by writing its message to the console.
    '
    ' args - command-line arguments; not used.
    Sub Main(ByVal args As String())
        PDFNet.Initialize(PDFTronLicense.Key)

        Dim inputDir As String = "../../../../TestFiles/"
        Dim outputDir As String = "../../../../TestFiles/Output/"

        ' Each sample can be switched off individually by setting its flag to False.
        Dim runSample1 As Boolean = True
        Dim runSample2 As Boolean = True
        Dim runSample3 As Boolean = True

        Try
            Using doc As New PDFDoc(inputDir & "tagged.pdf")
                doc.InitSecurityHandler()

                If runSample1 Then
                    Console.WriteLine("____________________________________________________________")
                    Console.WriteLine("Sample 1 - Traverse logical structure tree...")

                    Dim structTree As STree = doc.GetStructTree()

                    If Not structTree.IsValid() Then
                        Console.WriteLine("This document does not contain any logical structure.")
                    Else
                        Console.WriteLine("Document has a StructTree root.")

                        For kid As Integer = 0 To structTree.GetNumKids() - 1
                            ProcessStructElement(structTree.GetKid(kid), 0)
                        Next
                    End If

                    Console.WriteLine()
                    Console.WriteLine("Done 1.")
                End If

                If runSample2 Then
                    Console.WriteLine("____________________________________________________________")
                    Console.WriteLine("Sample 2 - Get parent logical structure elements from")
                    Console.WriteLine("layout elements.")

                    Dim pageReader As New ElementReader()
                    Dim pages As PageIterator = doc.GetPageIterator()

                    ' Visit every page, reporting the elements found on each one.
                    Do While pages.HasNext()
                        pageReader.Begin(pages.Current())
                        ProcessElements(pageReader)
                        pageReader.[End]()
                        pages.[Next]()
                    Loop

                    Console.WriteLine()
                    Console.WriteLine("Done 2.")
                End If

                If runSample3 Then
                    Console.WriteLine("____________________________________________________________")
                    Console.WriteLine("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")

                    ' Page index -> (MCID -> text) lookup, filled in one pass over the pages.
                    Dim textByPage As New Hashtable()
                    Dim pageReader As New ElementReader()
                    Dim pages As PageIterator = doc.GetPageIterator()

                    Do While pages.HasNext()
                        Dim currentPage As Page = pages.Current()
                        pageReader.Begin(currentPage)

                        Dim textByMcid As New Hashtable()
                        textByPage.Add(currentPage.GetIndex(), textByMcid)
                        ProcessElements2(pageReader, textByMcid)

                        pageReader.[End]()
                        pages.[Next]()
                    Loop

                    Dim structTree As STree = doc.GetStructTree()

                    If structTree.IsValid() Then
                        For kid As Integer = 0 To structTree.GetNumKids() - 1
                            ProcessStructElement2(structTree.GetKid(kid), textByPage, 0)
                        Next
                    End If

                    Console.WriteLine()
                    Console.WriteLine("Done 3.")
                End If

                ' NOTE(review): 0 is passed as the save-options argument, presumably
                ' meaning "no special options" - confirm against the PDFDoc.Save docs.
                doc.Save(outputDir & "LogicalStructure.pdf", 0)
            End Using

        Catch ex As PDFNetException
            Console.WriteLine(ex.Message)
        End Try

        PDFNet.Terminate()
    End Sub

End Module