PDF Data Extraction - Images, Text, Paths - Go Sample Code

Sample code for using Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Sample code provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8 "fmt"
9 "os"
10 "strconv"
11 . "pdftron"
12)
13
14import "pdftron/Samples/LicenseKey/GO"
15
16func ProcessPath(reader ElementReader, path Element){
17 if path.IsClippingPath(){
18 fmt.Println("This is a clipping path")
19 }
20
21 pathData := path.GetPathData()
22 data := pathData.GetPoints()
23 opr := pathData.GetOperators()
24
25 oprIndex := 0
26 oprEnd := int(opr.Size())
27 dataIndex := 0
28 //dataEnd := data.Size()
29
30 // Use path.GetCTM() if you are interested in CTM (current transformation matrix).
31
32 os.Stdout.Write([]byte("Path Data Points := \""))
33 x1, x2, x3, x4 := 0.0, 0.0, 0.0, 0.0
34 y1, y2, y3, y4 := 0.0, 0.0, 0.0, 0.0
35 for oprIndex < oprEnd{
36 if int(opr.Get(oprIndex)) == int(PathDataE_moveto){
37 x1 = data.Get(dataIndex)
38 dataIndex = dataIndex + 1
39 y1 = data.Get(dataIndex)
40 dataIndex = dataIndex + 1
41 os.Stdout.Write([]byte("M" + fmt.Sprintf("%f", x1) + " " + fmt.Sprintf("%f", y1)))
42 }else if int(opr.Get(oprIndex)) == int(PathDataE_lineto){
43 x1 = data.Get(dataIndex)
44 dataIndex = dataIndex + 1
45 y1 = data.Get(dataIndex)
46 dataIndex = dataIndex + 1
47 os.Stdout.Write([]byte(" L" + fmt.Sprintf("%f", x1) + " " + fmt.Sprintf("%f", y1)))
48 }else if int(opr.Get(oprIndex)) == int(PathDataE_cubicto){
49 x1 = data.Get(dataIndex)
50 dataIndex = dataIndex + 1
51 y1 = data.Get(dataIndex)
52 dataIndex = dataIndex + 1
53 x2 = data.Get(dataIndex)
54 dataIndex = dataIndex + 1
55 y2 = data.Get(dataIndex)
56 dataIndex = dataIndex + 1
57 x3 = data.Get(dataIndex)
58 dataIndex = dataIndex + 1
59 y3 = data.Get(dataIndex)
60 dataIndex = dataIndex + 1
61 os.Stdout.Write([]byte(" C" + fmt.Sprintf("%f", x1) + " " + fmt.Sprintf("%f", y1) + " " + fmt.Sprintf("%f", x2) + " " + fmt.Sprintf("%f", y2) + " " + fmt.Sprintf("%f", x3) + " " + fmt.Sprintf("%f", y3)))
62 }else if int(opr.Get(oprIndex)) == int(PathDataE_rect){
63 x1 = data.Get(dataIndex)
64 dataIndex = dataIndex + 1
65 y1 = data.Get(dataIndex)
66 dataIndex = dataIndex + 1
67 w := data.Get(dataIndex)
68 dataIndex = dataIndex + 1
69 h := data.Get(dataIndex)
70 dataIndex = dataIndex + 1
71 x2 = x1 + w
72 y2 = y1
73 x3 = x2
74 y3 = y1 + h
75 x4 = x1
76 y4 = y3
77 os.Stdout.Write([]byte("M" + fmt.Sprintf("%.2f", x1) + " " + fmt.Sprintf("%.2f", y1) + " L" + fmt.Sprintf("%.2f", x2) + " " + fmt.Sprintf("%.2f", y2) + " L" + fmt.Sprintf("%.2f", x3) + " " + fmt.Sprintf("%.2f", y3) + " L" + fmt.Sprintf("%.2f", x4) + " " + fmt.Sprintf("%.2f", y4) + " Z"))
78 }else if int(opr.Get(oprIndex)) == int(PathDataE_closepath){
79 fmt.Println(" Close Path")
80 }else{
81 //
82 }
83 oprIndex = oprIndex + 1
84 }
85
86 os.Stdout.Write([]byte("\" "))
87 gs := path.GetGState()
88
89 // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
90 if path.IsStroked(){
91 fmt.Println("Stroke path")
92
93 if (gs.GetStrokeColorSpace().GetType() == ColorSpaceE_pattern){
94 fmt.Println("Path has associated pattern")
95 }else{
96 // Get stroke color (you can use PDFNet color conversion facilities)
97 // rgb = gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor())
98 }
99 }else{
100 // Do not stroke path
101 }
102
103 if path.IsFilled(){
104 fmt.Println("Fill path")
105
106 if (gs.GetFillColorSpace().GetType() == ColorSpaceE_pattern){
107 fmt.Println("Path has associated pattern")
108 }else{
109 // rgb = gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor())
110 }
111 }else{
112 // Do not fill path
113 }
114
115 // Process any changes in graphics state ---------------------------------
116 gsItr := reader.GetChangesIterator()
117 for gsItr.HasNext(){
118 if int(gsItr.Current()) == int(GStateE_transform){
119 // Get transform matrix for this element. Unlike path.GetCTM()
120 // that return full transformation matrix gs.GetTransform() return
121 // only the transformation matrix that was installed for this element.
122 //
123 // gs.GetTransform()
124
125 }else if int(gsItr.Current()) == int(GStateE_line_width){
126 // gs.GetLineWidth()
127
128 }else if int(gsItr.Current()) == int(GStateE_line_cap){
129 // gs.GetLineCap()
130
131 }else if int(gsItr.Current()) == int(GStateE_line_join){
132 // gs.GetLineJoin()
133
134 }else if int(gsItr.Current()) == int(GStateE_flatness){
135
136 }else if int(gsItr.Current()) == int(GStateE_miter_limit){
137 // gs.GetMiterLimit()
138
139 }else if int(gsItr.Current()) == int(GStateE_dash_pattern){
140 // dashes = gs.GetDashes()
141 // gs.GetPhase()
142
143 }else if int(gsItr.Current()) == int(GStateE_fill_color){
144 if (int(gs.GetFillColorSpace().GetType()) == int(ColorSpaceE_pattern) && int(gs.GetFillPattern().GetType()) != int(PatternColorE_shading) ){
145 // process the pattern data
146 reader.PatternBegin(true)
147 ProcessElements(reader)
148 reader.End()
149 }
150 }
151 gsItr.Next()
152 }
153 reader.ClearChangeList()
154}
155
156func ProcessText (pageReader ElementReader){
157 // Begin text element
158 fmt.Println("Begin Text Block:")
159
160 element := pageReader.Next()
161
162 for element.GetMp_elem().Swigcptr() != 0{
163 etype := element.GetType()
164 if etype == ElementE_text_end{
165 // Finish the text block
166 fmt.Println("End Text Block.")
167 return
168 }else if etype == ElementE_text{
169 gs := element.GetGState()
170
171 //csFill := gs.GetFillColorSpace()
172 //fill := gs.GetFillColor()
173
174 //out := csFill.Convert2RGB(fill)
175
176 //csStroke := gs.GetStrokeColorSpace()
177 //stroke := gs.GetStrokeColor()
178
179 font := gs.GetFont()
180 fmt.Println("Font Name: " + font.GetName())
181 // font.IsFixedWidth()
182 // font.IsSerif()
183 // font.IsSymbolic()
184 // font.IsItalic()
185 // ...
186
187 // fontSize = gs.GetFontSize()
188 // wordSpacing = gs.GetWordSpacing()
189 // charSpacing = gs.GetCharSpacing()
190 // txt := element.GetTextString()
191 if font.GetType() == FontE_Type3{
192 // type 3 font, process its data
193 itr := element.GetCharIterator()
194 for itr.HasNext(){
195 pageReader.Type3FontBegin(itr.Current())
196 ProcessElements(pageReader)
197 pageReader.End()
198 }
199 }else{
200 text_mtx := element.GetTextMatrix()
201
202 itr := element.GetCharIterator()
203 for itr.HasNext(){
204 charCode := itr.Current().GetChar_data()
205 if *charCode >= 32 && *charCode <= 255 { // Print if in ASCII range...
206 a := font.MapToUnicode(uint(*charCode))
207 os.Stdout.Write([]byte( a )) // Revisit: if sys.version_info.major < 3 else ascii(a[0]) ))
208 }
209 pt := NewPoint()
210 pt.SetX(itr.Current().GetX()) // character positioning information
211 pt.SetY(itr.Current().GetY())
212
213 // Use element.GetCTM() if you are interested in the CTM
214 // (current transformation matrix).
215 ctm := element.GetCTM()
216
217 // To get the exact character positioning information you need to
218 // concatenate current text matrix with CTM and then multiply
219 // relative positioning coordinates with the resulting matrix.
220 mtx := ctm.Multiply(text_mtx)
221 mtx.Mult(pt)
222 itr.Next()
223 }
224 }
225 fmt.Println("")
226 }
227 element = pageReader.Next()
228 }
229}
230
231func ProcessImage (image Element){
232 //imageMask := image.IsImageMask()
233 //interpolate := image.IsImageInterpolate()
234 width := image.GetImageWidth()
235 height := image.GetImageHeight()
236 outDataSz := width * height * 3
237
238 fmt.Println("Image: width=\"" + fmt.Sprintf("%d", width) + "\"" + " height=\"" + fmt.Sprintf("%d", height)+ "\"" )
239
240 // Matrix2D& mtx = image->GetCTM() // image matrix (page positioning info)
241
242 // You can use GetImageData to read the raw (decoded) image data
243 //image->GetBitsPerComponent()
244 //image->GetImageData() // get raw image data
245 // .... or use Image2RGB filter that converts every image to RGB format,
246 // This should save you time since you don't need to deal with color conversions,
247 // image up-sampling, decoding etc.
248
249 imgConv := NewImage2RGB(image) // Extract and convert image to RGB 8-bps format
250 reader := NewFilterReader(imgConv)
251
252 //imageDataOut := reader.Read(int64(outDataSz))
253 reader.Read(int64(outDataSz))
254
255 // Note that you don't need to read a whole image at a time. Alternatively
256 // you can read a chuck at a time by repeatedly calling reader.Read(buf, buf_sz)
257 // until the function returns 0.
258}
259
260func ProcessElements(reader ElementReader){
261 element := reader.Next() // Read page contents
262 for element.GetMp_elem().Swigcptr() != 0{
263 etype := element.GetType()
264 if etype == ElementE_path{ // Process path data...
265 ProcessPath(reader, element)
266 }else if etype == ElementE_text_begin{ // Process text block...
267 ProcessText(reader)
268 }else if etype == ElementE_form{ // Process form XObjects
269 reader.FormBegin()
270 ProcessElements(reader)
271 reader.End()
272 }else if etype == ElementE_image{ // Process Images
273 ProcessImage(element)
274 }
275 element = reader.Next()
276 }
277}
278
279func main(){
280 PDFNetInitialize(PDFTronLicense.Key)
281
282 // Relative path to the folder containing the test files.
283 inputPath := "../../TestFiles/"
284 //outputPath := "../../TestFiles/Output/"
285
286 // Extract text data from all pages in the document
287
288 fmt.Println("__________________________________________________")
289 fmt.Println("Extract page element information from all ")
290 fmt.Println("pages in the document.")
291
292 doc := NewPDFDoc(inputPath + "newsletter.pdf")
293 doc.InitSecurityHandler()
294 //pgnum := doc.GetPageCount()
295 pageBegin := doc.GetPageIterator()
296 pageReader := NewElementReader()
297
298 itr := pageBegin
299 for itr.HasNext(){ // Read every page
300 fmt.Println("Page " + strconv.Itoa(itr.Current().GetIndex()) + "----------------------------------------")
301 pageReader.Begin(itr.Current())
302 ProcessElements(pageReader)
303 pageReader.End()
304 itr.Next()
305 }
306 doc.Close()
307 PDFNetTerminate()
308 fmt.Println("Done.")
309}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales