Smart Data Extraction - Go Sample Code

This Go sample shows how to use the Apryse Data Extraction module to extract tabular data, document structure, form fields and key-value pairs from PDF documents. Sample code is also provided in Python, C++, C# (.Net), Java, Node.js (JavaScript), PHP, Ruby and VB.

To run this sample, you will need to:

  1. Get started with Server SDK in your language/framework
  2. Download the Data Extraction Module

Learn more about our Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8 "fmt"
9 "testing"
10 "os"
11 "flag"
12 . "github.com/pdftron/pdftron-go/v2"
13)
14
// Command-line settings for the sample; both default to the empty string.
var (
	licenseKey string // value of -license
	modulePath string // value of -modulePath
)

// init registers the sample's string flags on the default flag set.
func init() {
	for _, opt := range []struct {
		dst   *string
		name  string
		usage string
	}{
		{&licenseKey, "license", "License key for Apryse SDK"},
		{&modulePath, "modulePath", "Path for downloaded modules"},
	} {
		flag.StringVar(opt.dst, opt.name, "", opt.usage)
	}
}
22
23//---------------------------------------------------------------------------------------
24// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
25// extract various types of data from PDF documents.
26//
27// The Apryse SDK Data Extraction suite can be downloaded from
28// https://docs.apryse.com/core/guides/info/modules
29//
30// Please contact us if you have any questions.
31//---------------------------------------------------------------------------------------
32
// Relative paths to the folder containing the test files and to the folder
// the samples write their results into.
var (
	inputPath  = "../TestFiles/"
	outputPath = "../TestFiles/Output/"
)
36
37//---------------------------------------------------------------------------------------
38
// catch turns a panic in the calling function into an error stored in *err.
//
// It only works when installed directly as a deferred call, e.g.
// `defer catch(&err)` with err being the caller's named result, because
// recover has no effect outside the deferred function itself.
//
// When no panic is in flight, *err is left untouched. When the panic value is
// itself an error it is wrapped with %w, so the message is unchanged but
// errors.Is / errors.As still see the original value; any other panic value is
// formatted with %v.
func catch(err *error) {
	r := recover()
	if r == nil {
		return
	}
	if cause, ok := r.(error); ok {
		// Wrap instead of flattening to text so the error chain is preserved.
		*err = fmt.Errorf("%w", cause)
		return
	}
	*err = fmt.Errorf("%v", r)
}
44
45//---------------------------------------------------------------------------------------
46
// WriteTextToFile creates outputFile (truncating it if it already exists) and
// writes text into it.
//
// The function has no return value: any failure from creating, writing or
// closing the file is printed to standard output. If the file cannot be
// created, nothing else is attempted.
func WriteTextToFile(outputFile string, text string) {
	f, err := os.Create(outputFile)
	if err != nil {
		fmt.Println(err)
		// No file was opened, so there is nothing to write to or close;
		// continuing would only print a second, misleading error.
		return
	}

	if _, err := f.WriteString(text); err != nil {
		fmt.Println(err)
	}

	// Close explicitly rather than via defer so that a failure while closing
	// is reported instead of being silently dropped.
	if err := f.Close(); err != nil {
		fmt.Println(err)
	}
}
60
61//---------------------------------------------------------------------------------------
62// The following sample illustrates how to extract tables from PDF documents.
63//---------------------------------------------------------------------------------------
64
65func TabularDataTest() (err error) {
66 defer catch(&err)
67
68 PDFNetAddResourceSearchPath(modulePath)
69
70 // Test if the add-on is installed
71 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_Tabular) {
72 fmt.Println("")
73 fmt.Println("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.")
74 fmt.Println("-----------------------------------------------------------------------------")
75 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
76 fmt.Println("at https://docs.apryse.com/documentation/core/guides/info/modules. If you have already")
77 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
78 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
79 fmt.Println("")
80 return nil
81 }
82
83 // Extract tabular data as a JSON file
84 fmt.Println("Extract tabular data as a JSON file")
85
86 inputFile := inputPath + "table.pdf"
87 outputFile := outputPath + "table.json"
88 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_Tabular)
89
90 fmt.Println("Result saved in " + outputFile)
91
92 // Extract tabular data as a JSON string
93 fmt.Println("Extract tabular data as a JSON string")
94
95 inputFile = inputPath + "financial.pdf"
96 outputFile = outputPath + "financial.json"
97
98 json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_Tabular).(string)
99 WriteTextToFile(outputFile, json)
100
101 fmt.Println("Result saved in " + outputFile)
102
103 // Extract tabular data as an XLSX file
104 fmt.Println("Extract tabular data as an XLSX file")
105
106 inputFile = inputPath + "table.pdf"
107 outputFile = outputPath + "table.xlsx"
108 DataExtractionModuleExtractToXLSX(inputFile, outputFile)
109
110 fmt.Println("Result saved in " + outputFile)
111
112 // Extract tabular data as an XLSX stream (also known as filter)
113 fmt.Println("Extract tabular data as an XLSX stream")
114
115 inputFile = inputPath + "financial.pdf"
116 outputFile = outputPath + "financial.xlsx"
117 outputXlsxStream := NewMemoryFilter(0, false)
118 outputFilter := NewFilter(outputXlsxStream)
119 options := NewDataExtractionOptions()
120 options.SetPages("1"); // page 1
121 DataExtractionModuleExtractToXLSX(inputFile, outputFilter, options)
122 outputXlsxStream.SetAsInputFilter()
123 outputXlsxStream.WriteToFile(outputFile, false)
124
125 fmt.Println("Result saved in " + outputFile)
126
127 return nil
128}
129
130//---------------------------------------------------------------------------------------
131// The following sample illustrates how to extract document structure from PDF documents.
132//---------------------------------------------------------------------------------------
133
134func DocumentStructureTest() (err error) {
135 defer catch(&err)
136
137 // Test if the add-on is installed
138 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_DocStructure) {
139 fmt.Println("")
140 fmt.Println("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
141 fmt.Println("-----------------------------------------------------------------------------")
142 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
143 fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
144 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
145 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
146 fmt.Println("")
147 return nil
148 }
149
150 // Extract document structure as a JSON file
151 fmt.Println("Extract document structure as a JSON file")
152
153 inputFile := inputPath + "paragraphs_and_tables.pdf"
154 outputFile := outputPath + "paragraphs_and_tables.json"
155 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocStructure)
156
157 fmt.Println("Result saved in " + outputFile)
158
159 // Extract document structure as a JSON string
160 fmt.Println("Extract document structure as a JSON string")
161
162 inputFile = inputPath + "tagged.pdf"
163 outputFile = outputPath + "tagged.json"
164 json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_DocStructure).(string)
165 WriteTextToFile(outputFile, json)
166
167 fmt.Println("Result saved in " + outputFile)
168
169 return nil
170}
171
172//---------------------------------------------------------------------------------------
173// The following sample illustrates how to extract form fields from PDF documents.
174//---------------------------------------------------------------------------------------
175
176func FormFieldsTest() (err error) {
177 defer catch(&err)
178
179 // Test if the add-on is installed
180 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_Form) {
181 fmt.Println("")
182 fmt.Println("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.")
183 fmt.Println("-----------------------------------------------------------------------------")
184 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
185 fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
186 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
187 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
188 fmt.Println("")
189 return nil
190 }
191
192 // Extract form fields as a JSON file
193 fmt.Println("Extract form fields as a JSON file")
194
195 inputFile := inputPath + "formfields-scanned.pdf"
196 outputFile := outputPath + "formfields-scanned.json"
197 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_Form)
198
199 fmt.Println("Result saved in " + outputFile)
200
201 // Extract form fields as a JSON string
202 fmt.Println("Extract form fields as a JSON string")
203
204 inputFile = inputPath + "formfields.pdf"
205 outputFile = outputPath + "formfields.json"
206
207 json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_Form).(string)
208 WriteTextToFile(outputFile, json)
209
210 fmt.Println("Result saved in " + outputFile)
211
212 //////////////////////////////////////////////////////////////////////////
213 // Detect and add form fields to a PDF document.
214 // PDF document already has form fields, and this sample will update to new found fields.
215 doc := NewPDFDoc(inputPath + "formfields-scanned-withfields.pdf")
216
217 fmt.Println("Extract form fields as a PDF file, keep new fields")
218 DataExtractionModuleDetectAndAddFormFieldsToPDF(doc)
219
220 outputFile = outputPath + "formfields-scanned-fields-new.pdf"
221 doc.Save(outputFile, uint(SDFDocE_linearized))
222 doc.Close()
223
224 fmt.Println("Result saved in " + outputFile)
225
226 //////////////////////////////////////////////////////////////////////////
227 // Detect and add form fields to a PDF document.
228 // PDF document already has form fields, and this sample will keep the original fields.
229 doc = NewPDFDoc(inputPath + "formfields-scanned-withfields.pdf")
230
231 // Setup DataExtractionOptions to keep old fields
232 options := NewDataExtractionOptions()
233 options.SetOverlappingFormFieldBehavior("KeepOld")
234
235 fmt.Println("Extract form fields as a PDF file, keep old fields")
236 DataExtractionModuleDetectAndAddFormFieldsToPDF(doc, options)
237
238 outputFile = outputPath + "formfields-scanned-fields-old.pdf"
239 doc.Save(outputFile, uint(SDFDocE_linearized))
240 doc.Close()
241
242 fmt.Println("Result saved in " + outputFile)
243
244 return nil
245}
246
247//---------------------------------------------------------------------------------------
248// The following sample illustrates how to extract key-value pairs from PDF documents.
249//---------------------------------------------------------------------------------------
250
251func GenericKeyValueTest() (err error) {
252 defer catch(&err)
253
254 // Test if the add-on is installed
255 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_GenericKeyValue) {
256 fmt.Println("")
257 fmt.Println("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.")
258 fmt.Println("-----------------------------------------------------------------------------")
259 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
260 fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
261 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
262 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
263 fmt.Println("")
264 return nil
265 }
266
267 fmt.Println("Extract key-value pairs from a PDF")
268
269 inputFile := inputPath + "newsletter.pdf"
270 outputFile := outputPath + "newsletter_key_val.json"
271 // Simple example: Extract Keys & Values as a JSON file
272 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_GenericKeyValue)
273
274 fmt.Println("Result saved in " + outputFile)
275
276 // Example with customized options:
277 // Extract Keys & Values from pages 2-4, excluding ads
278 options := NewDataExtractionOptions()
279 options.SetPages("2-4")
280
281 p2ExclusionZones := NewRectCollection()
282 // Exclude the ad on page 2
283 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
284 // Coordinates rotate with the page, if it has rotation applied.
285 p2ExclusionZones.AddRect(NewRect(166, 47, 562, 222))
286 options.AddExclusionZonesForPage(p2ExclusionZones, 2)
287
288 p4InclusionZones := NewRectCollection()
289 p4ExclusionZones := NewRectCollection()
290 // Only include the article text for page 4, exclude ads and headings
291 p4InclusionZones.AddRect(NewRect(30, 432, 562, 684))
292 p4ExclusionZones.AddRect(NewRect(30, 657, 295, 684))
293 options.AddInclusionZonesForPage(p4InclusionZones, 4)
294 options.AddExclusionZonesForPage(p4ExclusionZones, 4)
295
296 fmt.Println("Extract Key-Value pairs from specific pages and zones as a JSON file")
297 outputFile = outputPath + "newsletter_key_val_with_zones.json"
298 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_GenericKeyValue, options)
299
300 fmt.Println("Result saved in " + outputFile)
301
302 return nil
303}
304
305//---------------------------------------------------------------------------------------
306
307func TestDataExtraction(t *testing.T) {
308 // The first step in every application using PDFNet is to initialize the
309 // library. The library is usually initialized only once, but calling
310 // Initialize() multiple times is also fine.
311 PDFNetInitialize(licenseKey)
312
313 //-----------------------------------------------------------------------------------
314
315 PDFNetAddResourceSearchPath("../../../PDFNetC/Lib/")
316
317 //-----------------------------------------------------------------------------------
318
319 err := TabularDataTest()
320 if err != nil {
321 fmt.Println(fmt.Errorf("Unable to extract tabular data, error: %s", err))
322 }
323
324 //-----------------------------------------------------------------------------------
325
326 err = DocumentStructureTest()
327 if err != nil {
328 fmt.Println(fmt.Errorf("Unable to extract document structure data, error: %s", err))
329 }
330
331 //-----------------------------------------------------------------------------------
332
333 err = FormFieldsTest()
334 if err != nil {
335 fmt.Println(fmt.Errorf("Unable to extract form fields data, error: %s", err))
336 }
337
338 err = GenericKeyValueTest()
339 if err != nil {
340 fmt.Println(fmt.Errorf("Unable to extract key-value pairs, error: %s", err))
341 }
342
343 //-----------------------------------------------------------------------------------
344
345 PDFNetTerminate()
346 fmt.Println("Done.")
347}
348

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales