Some test text!

Discord Logo

Chat with us

PDFTron is now Apryse, learn more here.

PDF logical structure reader in Go

More languages

More languages
JavaScript
Java (Android)
C++
C#
C# (.NET Core)
Go
Java
Kotlin
Obj-C
JS (Node.js)
PHP
Python
Ruby
Swift
VB
C# (Xamarin)

Sample Go code for using PDFTron SDK to explore the logical structure and content of a tagged PDF file, then dumps the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s which are leaf nodes of the structure tree. Learn more about our Go PDF Library and PDF Parsing & Content Extraction Library.

Get Started Samples Download

To run this sample, get started with a free trial of Apryse SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
// Consult LICENSE.txt regarding license information.
//---------------------------------------------------------------------------------------

package main
import (
	"fmt"
    "strconv"
    "os"
	. "pdftron"
)

import  "pdftron/Samples/LicenseKey/GO"

//---------------------------------------------------------------------------------------
// This sample explores the structure and content of a tagged PDF document and dumps 
// the structure information to the console window.
//
// In tagged PDF documents StructTree acts as a central repository for information 
// related to a PDF document's logical structure. The tree consists of StructElement-s
// and ContentItem-s which are leaf nodes of the structure tree.
//
// The sample can be extended to access and extract the marked-content elements such 
// as text and images.
//---------------------------------------------------------------------------------------

func PrintIndent(indent int){
   os.Stdout.Write([]byte("\n"))
   i := 0
    for i < indent{
        os.Stdout.Write([]byte("  "))
        i = i + 1
    }
}

func ProcessStructElement(element SElement, indent int){
    if !element.IsValid(){
        return
    }

    // Print out the type and title info, if any.
    PrintIndent(indent)
    indent = indent + 1
    os.Stdout.Write([]byte("Type: " + element.GetType()))
    if element.HasTitle(){
        os.Stdout.Write([]byte(". Title:" + element.GetTitle()))
    }
    num := element.GetNumKids()
    i := 0
    for i < num{
        // Check if the kid is a leaf node (i.e. it is a ContentItem)
        if element.IsContentItem(i){
            cont := element.GetAsContentItem(i)
            etype := cont.GetType()
            
            page := cont.GetPage()
            
            PrintIndent(indent)
            os.Stdout.Write([]byte("Content Item. Part of page //" + strconv.Itoa(page.GetIndex())))
            PrintIndent(indent)
            if etype == ContentItemE_MCID{
                os.Stdout.Write([]byte("MCID: " + strconv.Itoa(cont.GetMCID())))
            }else if etype == ContentItemE_MCR{
                os.Stdout.Write([]byte("MCID: " + strconv.Itoa(cont.GetMCID())))
            }else if etype == ContentItemE_OBJR{
                os.Stdout.Write([]byte("OBJR "))
                refObj := cont.GetRefObj()
                if refObj != nil{
                    os.Stdout.Write([]byte("- Referenced Object//: " + strconv.Itoa(int(refObj.GetObjNum()))))
                }
            }
        }else{
            ProcessStructElement(element.GetAsStructElem(i), indent)
        }
        i = i + 1
    }
}    

// Used in code snippet 3.
func ProcessElements2(reader ElementReader, mcidPageMap map[int]string){
    element := reader.Next()
    for element.GetMp_elem().Swigcptr() != 0{ // Read page contents
        // In this sample we process only text, but the code can be extended
        // to handle paths, images, or other Element type.
        mcid := element.GetStructMCID()
        
        if mcid >= 0 && element.GetType() == ElementE_text{
            val := element.GetTextString()
            if _, ok := mcidPageMap[mcid]; ok {
                mcidPageMap[mcid] = mcidPageMap[mcid] + val
            }else{
                mcidPageMap[mcid] = val
            }
        }
        element = reader.Next()
    }
}

// Used in code snippet 2.
func ProcessElements(reader ElementReader){
    element := reader.Next()
    for element.GetMp_elem().Swigcptr() != 0{  // Read page contents
        // In this sample we process only paths & text, but the code can be 
        // extended to handle any element type.
        etype := element.GetType()
        if (etype == ElementE_path ||
            etype == ElementE_text ||
            etype == ElementE_path){
            if etype == ElementE_path{      // Process path ...
                os.Stdout.Write([]byte("\nPATH: "))
            }else if etype == ElementE_text{    // Process text ...
                os.Stdout.Write([]byte("\nTEXT: " + element.GetTextString() + "\n"))
            }else if etype == ElementE_path{    // Process from XObjects
                os.Stdout.Write([]byte("\nFORM XObject: "))
            }

            // Check if the element is associated with any structural element.
            // Content items are leaf nodes of the structure tree.
            structParent := element.GetParentStructElement()
            if structParent.IsValid(){
                // Print out the parent structural element's type, title, and object number.
                os.Stdout.Write([]byte(" Type: " + structParent.GetType() + ", MCID: " + strconv.Itoa(element.GetStructMCID())))
                if structParent.HasTitle(){
                    os.Stdout.Write([]byte(". Title: " + structParent.GetTitle()))
                }
                os.Stdout.Write([]byte(", Obj//: " + strconv.Itoa(int(structParent.GetSDFObj().GetObjNum()))))
            }
        }
        element = reader.Next()
    }
}        
        
func ProcessStructElement2(element SElement, mcidDocMap map[int](map[int]string), indent int){
    if !element.IsValid(){
        return
    }
    // Print out the type and title info, if any
    PrintIndent(indent)
    os.Stdout.Write([]byte("<" + element.GetType()))
    if element.HasTitle(){
        os.Stdout.Write([]byte(" title=\"" + element.GetTitle() + "\""))
    }
    os.Stdout.Write([]byte(">"))
    
    num := element.GetNumKids()
    i := 0
    for i < num{
        if element.IsContentItem(i){
            cont := element.GetAsContentItem(i)
            if cont.GetType() == ContentItemE_MCID{
                pageNum := cont.GetPage().GetIndex()
                if _, ok := mcidDocMap[pageNum]; ok{
                    mcidPageMap := mcidDocMap[pageNum]
                    mcidKey := cont.GetMCID()
                    if _, ok := mcidPageMap[mcidKey]; ok{
                        os.Stdout.Write([]byte(mcidPageMap[mcidKey]))
                    }
                }
            }
        }else{ // the kid is another StructElement node.
            ProcessStructElement2(element.GetAsStructElem(i), mcidDocMap, indent+1)     
        } 
        i = i + 1
    }
    PrintIndent(indent)
    os.Stdout.Write([]byte("</" + element.GetType() + ">"))
}        

func main(){
    PDFNetInitialize(PDFTronLicense.Key)
    
    // Relative path to the folder containing the test files.
    inputPath := "../../TestFiles/"
    outputPath := "../../TestFiles/Output/"
    
    // Extract logical structure from a PDF document
    doc := NewPDFDoc(inputPath + "tagged.pdf")
    doc.InitSecurityHandler()
    
    fmt.Println("____________________________________________________________")
    fmt.Println("Sample 1 - Traverse logical structure tree...")
    
    tree := doc.GetStructTree()
    if tree.IsValid(){
        fmt.Println("Document has a StructTree root.")
        
        i := 0
        for i < tree.GetNumKids(){
            // Recursively get structure info for all child elements.
            ProcessStructElement(tree.GetKid(i), 0)
            i = i + 1
        }
    }else{
        fmt.Println("This document does not contain any logical structure.")
    }

    fmt.Println("\nDone 1.")

    fmt.Println("____________________________________________________________")
    fmt.Println("Sample 2 - Get parent logical structure elements from")
    fmt.Println("layout elements.")
    
    reader := NewElementReader()
    itr := doc.GetPageIterator()
    for itr.HasNext(){
        reader.Begin(itr.Current())
        ProcessElements(reader)
        reader.End()
        itr.Next()
    }

    fmt.Println("\nDone 2.")
    
    fmt.Println("____________________________________________________________")
    fmt.Println("Sample 3 - 'XML style' extraction of PDF logical structure and page content.")
    // A map which maps page numbers(as Integers)
    // to page Maps(which map from struct mcid(as Integers) to
    // text Strings)
    var mcidDocMap = make(map[int](map[int]string))
    reader = NewElementReader()
    itr = doc.GetPageIterator()
    for itr.HasNext(){
        reader.Begin(itr.Current())
        var pageMcidMap = make(map[int]string)
        mcidDocMap[itr.Current().GetIndex()] = pageMcidMap
        ProcessElements2(reader, pageMcidMap)
        reader.End()
        itr.Next() 
    } 
    tree = doc.GetStructTree()
    if tree.IsValid(){
        i := 0
        for i < tree.GetNumKids(){
            ProcessStructElement2(tree.GetKid(i), mcidDocMap, 0)
            i = i + 1  
        }
    }
    fmt.Println("\nDone 3.")
    doc.Save(outputPath + "LogicalStructure.pdf", uint(SDFDocE_linearized))
    doc.Close()        
    PDFNetTerminate()
}