Some test text!

Discord Logo

Chat with us

PDFTron is now Apryse, learn more here.

PDF data extraction in Go (images, text, paths)

More languages

More languages
Java (Android)
C++
C#
C# (.NET Core)
Go
Java
Kotlin
Obj-C
JS (Node.js)
PHP
Python
Ruby
Swift
C# (UWP)
VB
C# (Xamarin)

Sample Go code for using PDFTron SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Learn more about our Go PDF Library and PDF Parsing & Content Extraction Library.

Get Started Samples Download

To run this sample, get started with a free trial of Apryse SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
// Consult LICENSE.txt regarding license information.
//---------------------------------------------------------------------------------------

package main
import (
	"fmt"
    "os"
    "strconv"
	. "pdftron"
)

import  "pdftron/Samples/LicenseKey/GO"

func ProcessPath(reader ElementReader, path Element){
    if path.IsClippingPath(){
        fmt.Println("This is a clipping path")
    }

    pathData := path.GetPathData()
    data := pathData.GetPoints()
    opr := pathData.GetOperators()

    oprIndex := 0
    oprEnd := int(opr.Size())
    dataIndex := 0
    //dataEnd := data.Size()
    
    // Use path.GetCTM() if you are interested in CTM (current transformation matrix).
    
    os.Stdout.Write([]byte("Path Data Points := \""))
    x1, x2, x3, x4 := 0.0, 0.0, 0.0, 0.0
    y1, y2, y3, y4 := 0.0, 0.0, 0.0, 0.0
    for oprIndex < oprEnd{
        if int(opr.Get(oprIndex)) == int(PathDataE_moveto){
            x1 = data.Get(dataIndex) 
            dataIndex = dataIndex + 1
            y1 = data.Get(dataIndex)
            dataIndex = dataIndex + 1
            os.Stdout.Write([]byte("M" + fmt.Sprintf("%f", x1) + " " + fmt.Sprintf("%f", y1)))
        }else if int(opr.Get(oprIndex)) == int(PathDataE_lineto){
            x1 = data.Get(dataIndex) 
            dataIndex = dataIndex + 1
            y1 = data.Get(dataIndex)
            dataIndex = dataIndex + 1
            os.Stdout.Write([]byte(" L" + fmt.Sprintf("%f", x1) + " " + fmt.Sprintf("%f", y1)))
        }else if int(opr.Get(oprIndex)) == int(PathDataE_cubicto){
            x1 = data.Get(dataIndex)
            dataIndex = dataIndex + 1
            y1 = data.Get(dataIndex)
            dataIndex = dataIndex + 1
            x2 = data.Get(dataIndex)
            dataIndex = dataIndex + 1
            y2 = data.Get(dataIndex)
            dataIndex = dataIndex + 1
            x3 = data.Get(dataIndex)
            dataIndex = dataIndex + 1
            y3 = data.Get(dataIndex)
            dataIndex = dataIndex + 1
            os.Stdout.Write([]byte(" C" + fmt.Sprintf("%f", x1) + " " + fmt.Sprintf("%f", y1) + " " + fmt.Sprintf("%f", x2) + " " + fmt.Sprintf("%f", y2) + " " + fmt.Sprintf("%f", x3) + " " + fmt.Sprintf("%f", y3)))
        }else if int(opr.Get(oprIndex)) == int(PathDataE_rect){
            x1 = data.Get(dataIndex)
            dataIndex = dataIndex + 1
            y1 = data.Get(dataIndex)
            dataIndex = dataIndex + 1
            w := data.Get(dataIndex)
            dataIndex = dataIndex + 1
            h := data.Get(dataIndex)
            dataIndex = dataIndex + 1
            x2 = x1 + w
            y2 = y1
            x3 = x2
            y3 = y1 + h
            x4 = x1
            y4 = y3
            os.Stdout.Write([]byte("M" + fmt.Sprintf("%.2f", x1) + " " + fmt.Sprintf("%.2f", y1) + " L" + fmt.Sprintf("%.2f", x2) + " " + fmt.Sprintf("%.2f", y2) + " L" + fmt.Sprintf("%.2f", x3) + " " + fmt.Sprintf("%.2f", y3) + " L" + fmt.Sprintf("%.2f", x4) + " " + fmt.Sprintf("%.2f", y4) + " Z"))
        }else if int(opr.Get(oprIndex)) == int(PathDataE_closepath){
            fmt.Println(" Close Path")
        }else{
            //
        }
        oprIndex = oprIndex + 1
    }

    os.Stdout.Write([]byte("\" "))
    gs := path.GetGState()
    
    // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    if path.IsStroked(){
        fmt.Println("Stroke path")
        
        if (gs.GetStrokeColorSpace().GetType() == ColorSpaceE_pattern){
            fmt.Println("Path has associated pattern")
        }else{
            // Get stroke color (you can use PDFNet color conversion facilities)
            // rgb = gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor())
        }
    }else{
        // Do not stroke path
    }

    if path.IsFilled(){
        fmt.Println("Fill path")
        
        if (gs.GetFillColorSpace().GetType() == ColorSpaceE_pattern){
            fmt.Println("Path has associated pattern")
        }else{
            // rgb = gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor())
        }
    }else{
        // Do not fill path
    }

    // Process any changes in graphics state  ---------------------------------
    gsItr := reader.GetChangesIterator()
    for gsItr.HasNext(){
        if int(gsItr.Current()) == int(GStateE_transform){
            // Get transform matrix for this element. Unlike path.GetCTM() 
            // that return full transformation matrix gs.GetTransform() return 
            // only the transformation matrix that was installed for this element.
            //
            // gs.GetTransform()
            
        }else if int(gsItr.Current()) == int(GStateE_line_width){
            // gs.GetLineWidth()
            
        }else if int(gsItr.Current()) == int(GStateE_line_cap){
            // gs.GetLineCap()
            
        }else if int(gsItr.Current()) == int(GStateE_line_join){
            // gs.GetLineJoin()
            
        }else if int(gsItr.Current()) == int(GStateE_flatness){
            
        }else if int(gsItr.Current()) == int(GStateE_miter_limit){
            // gs.GetMiterLimit()
            
        }else if int(gsItr.Current()) == int(GStateE_dash_pattern){
            // dashes = gs.GetDashes()
            // gs.GetPhase()
            
        }else if int(gsItr.Current()) == int(GStateE_fill_color){
            if (int(gs.GetFillColorSpace().GetType()) == int(ColorSpaceE_pattern) && int(gs.GetFillPattern().GetType()) != int(PatternColorE_shading) ){
                // process the pattern data
                reader.PatternBegin(true)
                ProcessElements(reader)
                reader.End()
            }
        }
        gsItr.Next()
    }
    reader.ClearChangeList()
}

func ProcessText (pageReader ElementReader){
    // Begin text element
    fmt.Println("Begin Text Block:")
    
    element := pageReader.Next()
    
    for element.GetMp_elem().Swigcptr() != 0{
        etype := element.GetType()
        if etype == ElementE_text_end{
            // Finish the text block
            fmt.Println("End Text Block.")
            return
        }else if etype == ElementE_text{
            gs := element.GetGState()
            
            //csFill := gs.GetFillColorSpace()
            //fill := gs.GetFillColor()
            
            //out := csFill.Convert2RGB(fill)
            
            //csStroke := gs.GetStrokeColorSpace()
            //stroke := gs.GetStrokeColor()
            
            font := gs.GetFont()
            fmt.Println("Font Name: " + font.GetName())
            // font.IsFixedWidth()
            // font.IsSerif()
            // font.IsSymbolic()
            // font.IsItalic()
            // ... 

            // fontSize = gs.GetFontSize()
            // wordSpacing = gs.GetWordSpacing()
            // charSpacing = gs.GetCharSpacing()
            // txt := element.GetTextString()
            if font.GetType() == FontE_Type3{
                // type 3 font, process its data
                itr := element.GetCharIterator()
                for itr.HasNext(){
                    pageReader.Type3FontBegin(itr.Current())
                    ProcessElements(pageReader)
                    pageReader.End()
                }
            }else{
                text_mtx := element.GetTextMatrix()
                
                itr := element.GetCharIterator()
                for itr.HasNext(){
                    charCode := itr.Current().GetChar_data()
                    if *charCode >= 32 && *charCode <= 255 {     // Print if in ASCII range...
                        a := font.MapToUnicode(uint(*charCode))
                        os.Stdout.Write([]byte( a )) // Revisit: if sys.version_info.major < 3 else ascii(a[0]) ))
                    }    
                    pt := NewPoint()   
                    pt.SetX(itr.Current().GetX())     // character positioning information
                    pt.SetY(itr.Current().GetY())
                    
                    // Use element.GetCTM() if you are interested in the CTM 
                    // (current transformation matrix).
                    ctm := element.GetCTM()
                    
                    // To get the exact character positioning information you need to 
                    // concatenate current text matrix with CTM and then multiply 
                    // relative positioning coordinates with the resulting matrix.
                    mtx := ctm.Multiply(text_mtx)
                    mtx.Mult(pt)
                    itr.Next()
                }
            }
            fmt.Println("")
        }
        element = pageReader.Next()
    }
}

func ProcessImage (image Element){
    //imageMask := image.IsImageMask()
    //interpolate := image.IsImageInterpolate()
    width := image.GetImageWidth()
    height := image.GetImageHeight()
    outDataSz := width * height * 3
    
    fmt.Println("Image: width=\"" + fmt.Sprintf("%d", width) + "\"" + " height=\"" + fmt.Sprintf("%d", height)+ "\"" )
    
    // Matrix2D& mtx = image->GetCTM() // image matrix (page positioning info)

    // You can use GetImageData to read the raw (decoded) image data
    //image->GetBitsPerComponent()    
    //image->GetImageData()    // get raw image data
    // .... or use Image2RGB filter that converts every image to RGB format,
    // This should save you time since you don't need to deal with color conversions, 
    // image up-sampling, decoding etc.
    
    imgConv := NewImage2RGB(image)     // Extract and convert image to RGB 8-bps format
    reader := NewFilterReader(imgConv)

    //imageDataOut := reader.Read(int64(outDataSz))
    reader.Read(int64(outDataSz))
    
    // Note that you don't need to read a whole image at a time. Alternatively
    // you can read a chuck at a time by repeatedly calling reader.Read(buf, buf_sz) 
    // until the function returns 0. 
}

func ProcessElements(reader ElementReader){
    element := reader.Next()     // Read page contents
    for element.GetMp_elem().Swigcptr() != 0{
        etype := element.GetType()
        if etype == ElementE_path{      // Process path data...
            ProcessPath(reader, element)
        }else if etype == ElementE_text_begin{      // Process text block...
            ProcessText(reader)
        }else if etype == ElementE_form{    // Process form XObjects
            reader.FormBegin()
            ProcessElements(reader)
            reader.End()
        }else if etype == ElementE_image{    // Process Images
            ProcessImage(element)
        }
        element = reader.Next()
    }
}

func main(){
    PDFNetInitialize(PDFTronLicense.Key)
    
    // Relative path to the folder containing the test files.
    inputPath := "../../TestFiles/"
    //outputPath := "../../TestFiles/Output/"
    
    // Extract text data from all pages in the document
    
    fmt.Println("__________________________________________________")
    fmt.Println("Extract page element information from all ")
    fmt.Println("pages in the document.")
    
    doc := NewPDFDoc(inputPath + "newsletter.pdf")
    doc.InitSecurityHandler()
    //pgnum := doc.GetPageCount()
    pageBegin := doc.GetPageIterator()
    pageReader := NewElementReader()
    
    itr := pageBegin
    for itr.HasNext(){    // Read every page
        fmt.Println("Page " + strconv.Itoa(itr.Current().GetIndex()) + "----------------------------------------")
        pageReader.Begin(itr.Current())
        ProcessElements(pageReader)
        pageReader.End()
        itr.Next()
    }
    doc.Close()
    PDFNetTerminate()
    fmt.Println("Done.")
}