More languages
Some test text!
More languages
Sample VB code that uses the PDFTron SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems, the latter being the leaf nodes of the structure tree. Learn more about our VB PDF Library and PDF Parsing & Content Extraction Library.
Get Started Samples Download. To run this sample, get started with a free trial of Apryse SDK.
'
' Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
'
Imports System
Imports System.Collections
Imports pdftron
Imports pdftron.Common
Imports pdftron.Filters
Imports pdftron.SDF
Imports pdftron.PDF
Imports pdftron.PDF.Struct
Module LogicalStructureTestCS
' Value returned by pdftron.PDFNetLoader.Instance(); assigned once in the
' module constructor below and not read anywhere else in this module.
' NOTE(review): presumably holding this reference makes sure the PDFNet
' native library is loaded before any PDFNet call - confirm against the SDK docs.
Private pdfNetLoader As PDFNetLoader

' Module constructor: runs once, before the module's members are first used.
Sub New()
    pdfNetLoader = pdftron.PDFNetLoader.Instance()
End Sub
''' <summary>
''' Starts a new console line and writes the requested number of leading spaces.
''' </summary>
''' <param name="indent">Number of spaces to write; zero or a negative value writes none.</param>
Sub PrintIndent(ByVal indent As Integer)
    Console.WriteLine()
    ' Clamp at zero so a non-positive indent produces no padding at all.
    Dim padding As New String(" "c, Math.Max(indent, 0))
    Console.Write(padding)
End Sub
''' <summary>
''' Writes a structure element to the console and then walks its kids:
''' content items are described in place, nested structure elements are
''' processed recursively.
''' </summary>
''' <param name="element">Structure element to print; an invalid element is ignored.</param>
''' <param name="indent">Number of spaces used for this element's header line; kids use indent + 1.</param>
Sub ProcessStructElement(ByVal element As SElement, ByVal indent As Integer)
    If Not element.IsValid() Then
        Return
    End If

    ' Header line for this element at the current depth.
    PrintIndent(indent)
    Console.Write("Type: " & element.[GetType]())
    If element.HasTitle() Then
        Console.Write(". Title: " & element.GetTitle())
    End If

    ' Everything below this element is printed one level deeper.
    Dim kidIndent As Integer = indent + 1

    Dim num As Integer = element.GetNumKids()
    For i As Integer = 0 To num - 1
        If element.IsContentItem(i) Then
            Dim cont As ContentItem = element.GetAsContentItem(i)
            Dim itemType As ContentItem.Type = cont.[GetType]()
            Dim itemPage As Page = cont.GetPage()

            PrintIndent(kidIndent)
            Console.Write("Content Item. Part of page #" & itemPage.GetIndex())

            PrintIndent(kidIndent)
            Select Case itemType
                Case ContentItem.Type.e_MCID, ContentItem.Type.e_MCR
                    Console.Write("MCID: " & cont.GetMCID())
                Case ContentItem.Type.e_OBJR
                    Console.Write("OBJR ")
                    Dim ref_obj As Obj = cont.GetRefObj()
                    If ref_obj IsNot Nothing Then
                        Console.Write("- Referenced Object#: " & ref_obj.GetObjNum())
                    End If
            End Select
        Else
            ' The kid is itself a structure element: recurse one level deeper.
            ProcessStructElement(element.GetAsStructElem(i), kidIndent)
        End If
    Next
End Sub
''' <summary>
''' Reads every element from the reader and, for path, text and form XObject
''' elements, writes a label followed by details of the parent structure
''' element (type, MCID, optional title and object number) when that parent
''' is valid.
''' </summary>
''' <param name="reader">Element reader to drain; the caller is responsible for Begin and End.</param>
Sub ProcessElements(ByVal reader As ElementReader)
    Dim elem As Element = reader.Next()
    While elem IsNot Nothing ' Read page contents
        Dim elemType As Element.Type = elem.[GetType]()

        ' A single Select Case both labels the element and decides whether it
        ' is reported at all. The previous separate filter tested e_path twice
        ' and never e_form, which left the form XObject branch unreachable.
        Dim reported As Boolean = True
        Select Case elemType
            Case Element.Type.e_path
                Console.WriteLine()
                Console.Write("PATH: ")
            Case Element.Type.e_text
                Console.WriteLine()
                Console.WriteLine("TEXT: " & elem.GetTextString())
            Case Element.Type.e_form
                Console.WriteLine()
                Console.Write("FORM XObject: ")
            Case Else
                reported = False
        End Select

        If reported Then
            Dim struct_parent As SElement = elem.GetParentStructElement()
            If struct_parent.IsValid() Then
                Console.Write(" Type: " & struct_parent.[GetType]() & ", MCID: " & String.Format("{0}", elem.GetStructMCID()))
                If struct_parent.HasTitle() Then
                    Console.Write(". Title: " & struct_parent.GetTitle())
                End If
                Console.Write(", Obj#: " & struct_parent.GetSDFObj().GetObjNum())
            End If
        End If

        elem = reader.Next()
    End While
End Sub
''' <summary>
''' Reads every element from the reader and collects the text of text elements
''' that carry a non-negative MCID, concatenating the strings per MCID in
''' reading order.
''' </summary>
''' <param name="reader">Element reader to drain; the caller is responsible for Begin and End.</param>
''' <param name="mcid_page_map">Table updated in place: key is the MCID, value is the accumulated text.</param>
Sub ProcessElements2(ByVal reader As ElementReader, ByVal mcid_page_map As Hashtable)
    Dim current As Element = reader.Next()
    Do While current IsNot Nothing
        Dim markedId As Integer = current.GetStructMCID()
        ' Only text elements that have an MCID (>= 0) contribute.
        If markedId >= 0 AndAlso current.[GetType]() = Element.Type.e_text Then
            Dim text As String = current.GetTextString()
            If Not mcid_page_map.ContainsKey(markedId) Then
                ' First text run seen for this MCID.
                mcid_page_map.Add(markedId, text)
            Else
                ' Append to the text already stored for this MCID.
                Dim soFar As String = CStr(mcid_page_map(markedId))
                mcid_page_map(markedId) = soFar & text
            End If
        End If
        current = reader.Next()
    Loop
End Sub
''' <summary>
''' Writes a structure element as an opening tag, its content and a closing
''' tag. Content comes from the MCID text collected per page; nested structure
''' elements are written recursively one indent level deeper.
''' </summary>
''' <param name="element">Structure element to print; an invalid element is ignored.</param>
''' <param name="mcid_doc_map">Table keyed by page index whose values are per-page tables of MCID to text.</param>
''' <param name="indent">Number of spaces used to indent the opening and closing tags.</param>
Sub ProcessStructElement2(ByVal element As SElement, ByVal mcid_doc_map As Hashtable, ByVal indent As Integer)
    If Not element.IsValid() Then
        Return
    End If

    ' Opening tag, with the title as an attribute when the element has one.
    PrintIndent(indent)
    Console.Write("<" & element.[GetType]())
    If element.HasTitle() Then
        Console.Write(" title=""" & element.GetTitle() & """")
    End If
    Console.Write(">")

    Dim kidCount As Integer = element.GetNumKids()
    For k As Integer = 0 To kidCount - 1
        If Not element.IsContentItem(k) Then
            ' Nested structure element: recurse one level deeper.
            ProcessStructElement2(element.GetAsStructElem(k), mcid_doc_map, indent + 1)
            Continue For
        End If

        ' Only MCID content items have text to look up.
        Dim item As ContentItem = element.GetAsContentItem(k)
        If item.[GetType]() <> ContentItem.Type.e_MCID Then
            Continue For
        End If

        Dim pageIndex As Integer = item.GetPage().GetIndex()
        If Not mcid_doc_map.ContainsKey(pageIndex) Then
            Continue For
        End If

        Dim pageTexts As Hashtable = CType(mcid_doc_map(pageIndex), Hashtable)
        Dim markedId As Integer = item.GetMCID()
        If pageTexts.ContainsKey(markedId) Then
            Console.Write(pageTexts(markedId))
        End If
    Next

    ' Closing tag on its own line at the same depth as the opening tag.
    PrintIndent(indent)
    Console.Write("</" & element.[GetType]() & ">")
End Sub
''' <summary>
''' Entry point. Opens tagged.pdf from the test-files folder, runs the three
''' logical-structure samples against it, saves the document to the output
''' folder and terminates PDFNet.
''' </summary>
''' <param name="args">Command-line arguments; not used.</param>
Sub Main(ByVal args As String())
    PDFNet.Initialize(PDFTronLicense.Key)

    ' Folders are relative to the working directory of the running sample.
    Dim inputDir As String = "../../../../TestFiles/"
    Dim outputDir As String = "../../../../TestFiles/Output/"

    Try
        Using doc As New PDFDoc(inputDir & "tagged.pdf")
            doc.InitSecurityHandler()

            ' Switches for the individual samples; all are enabled.
            Dim runSample1 As Boolean = True
            Dim runSample2 As Boolean = True
            Dim runSample3 As Boolean = True

            If runSample1 Then
                Console.WriteLine("____________________________________________________________")
                Console.WriteLine("Sample 1 - Traverse logical structure tree...")

                Dim structTree As STree = doc.GetStructTree()
                If Not structTree.IsValid() Then
                    Console.WriteLine("This document does not contain any logical structure.")
                Else
                    Console.WriteLine("Document has a StructTree root.")
                    For kid As Integer = 0 To structTree.GetNumKids() - 1
                        ' Recursively print each top-level structure element.
                        ProcessStructElement(structTree.GetKid(kid), 0)
                    Next
                End If

                Console.WriteLine()
                Console.WriteLine("Done 1.")
            End If

            If runSample2 Then
                Console.WriteLine("____________________________________________________________")
                Console.WriteLine("Sample 2 - Get parent logical structure elements from")
                Console.WriteLine("layout elements.")

                Dim pageReader As New ElementReader()
                Dim pages As PageIterator = doc.GetPageIterator()
                Do While pages.HasNext()
                    pageReader.Begin(pages.Current())
                    ProcessElements(pageReader)
                    pageReader.[End]()
                    pages.[Next]()
                Loop

                Console.WriteLine()
                Console.WriteLine("Done 2.")
            End If

            If runSample3 Then
                Console.WriteLine("____________________________________________________________")
                Console.WriteLine("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")

                ' First pass: for every page, collect the text of each MCID.
                Dim textByPage As New Hashtable()
                Dim pageReader As New ElementReader()
                Dim pages As PageIterator = doc.GetPageIterator()
                Do While pages.HasNext()
                    Dim currentPage As Page = pages.Current()
                    pageReader.Begin(currentPage)
                    Dim textByMcid As New Hashtable()
                    textByPage.Add(currentPage.GetIndex(), textByMcid)
                    ProcessElements2(pageReader, textByMcid)
                    pageReader.[End]()
                    pages.[Next]()
                Loop

                ' Second pass: walk the structure tree and emit the collected text.
                Dim structTree As STree = doc.GetStructTree()
                If structTree.IsValid() Then
                    For kid As Integer = 0 To structTree.GetNumKids() - 1
                        ProcessStructElement2(structTree.GetKid(kid), textByPage, 0)
                    Next
                End If

                Console.WriteLine()
                Console.WriteLine("Done 3.")
            End If

            doc.Save(outputDir & "LogicalStructure.pdf", 0)
        End Using
    Catch e As PDFNetException
        Console.WriteLine(e.Message)
    End Try

    PDFNet.Terminate()
End Sub
End Module