Extract Text, Read, Parse PDF - TextExtract - Go Sample Code

Sample code for using Apryse SDK to read a PDF (parse and extract text), provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. If you'd like to search text on PDF pages, see our code sample for text search.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8 "fmt"
9 "strconv"
10 "os"
11 . "pdftron"
12)
13
14import "pdftron/Samples/LicenseKey/GO"
15
16func PrintStyle (style Style){
17 sansSerifStr := ""
18 if style.IsSerif(){
19 sansSerifStr = " sans-serif;"
20 }
21 rgb := style.GetColor()
22 rgbHex := fmt.Sprintf("%02X%02X%02X;", rgb.Get(0), rgb.Get(1), rgb.Get(2))
23 fontStr := fmt.Sprintf("%g", style.GetFontSize())
24 os.Stdout.Write([]byte(" style=\"font-family:" + style.GetFontName() + "; font-size:" + fontStr + ";" + sansSerifStr + " color:#" + rgbHex + "\""))
25}
26
27func DumpAllText (reader ElementReader){
28 element := reader.Next()
29
30 for element.GetMp_elem().Swigcptr() != 0{
31 etype := element.GetType()
32 if etype == ElementE_text_begin{
33 fmt.Println("Text Block Begin")
34 }else if etype == ElementE_text_end{
35 fmt.Println("Text Block End")
36 }else if etype == ElementE_text{
37 bbox := element.GetBBox()
38 fmt.Println("BBox: " + fmt.Sprintf("%f", bbox.GetX1()) + ", " + fmt.Sprintf("%f", bbox.GetY1()) + ", " +
39 fmt.Sprintf("%f", bbox.GetX2()) + ", " + fmt.Sprintf("%f", bbox.GetY2()))
40 textString := element.GetTextString()
41 fmt.Println(textString)
42 }else if etype == ElementE_text_new_line{
43 fmt.Println("New Line")
44 }else if etype == ElementE_form{
45 reader.FormBegin()
46 DumpAllText(reader)
47 reader.End()
48 }
49 element = reader.Next()
50 }
51}
52
53// A utility method used to extract all text content from
54// a given selection rectangle. The recnagle coordinates are
55// expressed in PDF user/page coordinate system.
56func ReadTextFromRect (page Page, pos Rect, reader ElementReader) string{
57 reader.Begin(page)
58 srchStr := RectTextSearch(reader, pos)
59 reader.End()
60 return srchStr
61}
62//A helper method for ReadTextFromRect
63func RectTextSearch (reader ElementReader, pos Rect) string{
64 element := reader.Next()
65 srchStr2 := ""
66 for element.GetMp_elem().Swigcptr() != 0{
67 etype := element.GetType()
68 if etype == ElementE_text{
69 bbox := element.GetBBox()
70 if (bbox.IntersectRect(bbox, pos)){
71 arr := element.GetTextString()
72 srchStr2 += arr
73 srchStr2 += "\n"
74 }
75 }else if etype == ElementE_text_new_line{
76 //handle text new line here
77 }else if etype == ElementE_form{
78 reader.FormBegin()
79 srchStr2 += RectTextSearch(reader, pos)
80 fmt.Println(srchStr2)
81 reader.End()
82 }
83 element = reader.Next()
84 }
85 return srchStr2
86}
87
88func main(){
89 PDFNetInitialize(PDFTronLicense.Key)
90
91 // Relative path to the folder containing test files.
92 inputPath := "../../TestFiles/newsletter.pdf"
93 example1Basic := false
94 example2Xml := false
95 example3Wordlist := false
96 example4Advanced := true
97 example5LowLevel := false
98
99 // Sample code showing how to use high-level text extraction APIs.
100 doc := NewPDFDoc(inputPath)
101 doc.InitSecurityHandler()
102
103 page := doc.GetPage(1)
104 if page == nil{
105 fmt.Println("page no found")
106 }
107 txt := NewTextExtractor()
108 txt.Begin(page) // Read the page
109
110 // Example 1. Get all text on the page in a single string.
111 // Words will be separated witht space or new line characters.
112 if example1Basic{
113 fmt.Println("Word count: " + strconv.Itoa(txt.GetWordCount()))
114 txtAsText := txt.GetAsText()
115 fmt.Println("- GetAsText --------------------------" + txtAsText)
116 fmt.Println("-----------------------------------------------------------")
117 }
118 // Example 2. Get XML logical structure for the page.
119 if example2Xml{
120 text := txt.GetAsXML(TextExtractorE_words_as_elements |
121 TextExtractorE_output_bbox |
122 TextExtractorE_output_style_info)
123 fmt.Println("- GetAsXML --------------------------" + text)
124 fmt.Println("-----------------------------------------------------------")
125 }
126 // Example 3. Extract words one by one.
127 if example3Wordlist{
128 word := NewWord()
129 line := txt.GetFirstLine()
130 for line.IsValid(){
131 word = line.GetFirstWord()
132 for word.IsValid(){
133 wordString := word.GetString()
134 fmt.Println(wordString)
135 word = word.GetNextWord()
136 }
137 line = line.GetNextLine()
138 }
139 fmt.Println("-----------------------------------------------------------")
140 }
141 // Example 4. A more advanced text extraction example.
142 // The output is XML structure containing paragraphs, lines, words,
143 // as well as style and positioning information.
144 if example4Advanced{
145 bbox := NewRect()
146 curFlowId := -1
147 curParaId := -1
148
149 fmt.Println("<PDFText>")
150 // For each line on the page...
151 line := txt.GetFirstLine()
152 for line.IsValid(){
153 if line.GetNumWords() == 0{
154 line = line.GetNextLine()
155 continue
156 }
157 word := line.GetFirstWord()
158 if curFlowId != line.GetFlowID(){
159 if curFlowId != -1{
160 if curParaId != -1{
161 curParaId = -1
162 fmt.Println("</Para>")
163 }
164 fmt.Println("</Flow>")
165 }
166 curFlowId = line.GetFlowID()
167 fmt.Println("<Flow id=\"" + strconv.Itoa(curFlowId) +"\">")
168 }
169 if curParaId != line.GetParagraphID(){
170 if curParaId != -1{
171 fmt.Println("</Para>")
172 }
173 curParaId= line.GetParagraphID()
174 fmt.Println("<Para id=\"" +strconv.Itoa(curParaId)+ "\">")
175 }
176 bbox = line.GetBBox()
177 lineStyle := line.GetStyle()
178 os.Stdout.Write([]byte(fmt.Sprintf("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2())))
179 PrintStyle (lineStyle)
180 os.Stdout.Write([]byte(" cur_num=\"" + strconv.Itoa(line.GetCurrentNum()) + "\"" + ">\n"))
181
182 // For each word in the line...
183 word = line.GetFirstWord()
184 for word.IsValid(){
185 // Output the bounding box for the word
186 bbox = word.GetBBox()
187 os.Stdout.Write([]byte(fmt.Sprintf("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2())))
188 os.Stdout.Write([]byte(" cur_num=\"" + strconv.Itoa(word.GetCurrentNum()) + "\""));
189 sz := word.GetStringLen()
190 if sz == 0{
191 word = word.GetNextWord()
192 continue
193 }
194 // If the word style is different from the parent style, output the new style.
195 s := word.GetStyle()
196 if !s.IsEqual(lineStyle){
197 PrintStyle (s)
198 }
199 wordString := word.GetString()
200 os.Stdout.Write([]byte(">" + wordString + "</Word>\n"))
201 word = word.GetNextWord()
202 }
203 os.Stdout.Write([]byte("</Line>\n"))
204 line = line.GetNextLine()
205 }
206 if curFlowId != -1{
207 if curParaId != -1{
208 curParaId = -1
209 os.Stdout.Write([]byte("</Para>\n"))
210 }
211 os.Stdout.Write([]byte("</Flow>\n"))
212 }
213 txt.Destroy()
214 doc.Close()
215 fmt.Println("</PDFText>")
216 }
217 // Sample code showing how to use low-level text extraction APIs.
218 if example5LowLevel{
219 doc = NewPDFDoc(inputPath)
220 doc.InitSecurityHandler()
221
222 // Example 1. Extract all text content from the document
223
224 reader := NewElementReader()
225 itr := doc.GetPageIterator()
226 for itr.HasNext(){
227 reader.Begin(itr.Current())
228 DumpAllText(reader)
229 reader.End()
230 itr.Next()
231 }
232
233 // Example 2. Extract text content based on the
234 // selection rectangle.
235
236 fmt.Println("----------------------------------------------------")
237 fmt.Println("Extract text based on the selection rectangle.")
238 fmt.Println("----------------------------------------------------")
239
240 itr = doc.GetPageIterator()
241 firstPage := itr.Current()
242 s1 := ReadTextFromRect(firstPage, NewRect(27.0, 392.0, 563.0, 534.0), reader)
243 fmt.Println("Field 1: " + s1)
244
245 s1 = ReadTextFromRect(firstPage, NewRect(28.0, 551.0, 106.0, 623.0), reader);
246 fmt.Println("Field 2: " + s1)
247
248 s1 = ReadTextFromRect(firstPage, NewRect(208.0, 550.0, 387.0, 621.0), reader);
249 fmt.Println("Field 3: " + s1)
250
251 doc.Close()
252 PDFNetTerminate()
253 fmt.Println("Done.")
254 }
255}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales