This sample code shows how to use the Apryse Data Extraction module to extract tabular data, document structure, and form fields from PDF documents. Sample code is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, and VB.
Learn more about our Server SDK.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6using System;
7
8using pdftron;
9using pdftron.Common;
10using pdftron.PDF;
11using pdftron.SDF;
12using pdftron.Filters;
13
14namespace DataExtractionTestCS
15{
16 /// <summary>
17 ///---------------------------------------------------------------------------------------
18 /// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
19 /// extract various types of data from PDF documents.
20 ///
21 /// The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
22 ///---------------------------------------------------------------------------------------
23 /// </summary>
24 class Class1
25 {
// Holds the PDFNet loader singleton; the initializer runs as part of this type's initialization.
private static readonly pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();

// NOTE(review): the explicit (empty) static constructor presumably keeps the type from being
// marked beforefieldinit, so the loader above is created before any static member is used — confirm.
static Class1() { }

// Relative paths to the folder containing the test files and to the folder receiving the output.
// Neither is reassigned anywhere in this class, hence readonly.
private static readonly string input_path = "../../../../TestFiles/";
private static readonly string output_path = "../../../../TestFiles/Output/";
32
33
/// <summary>
/// The following sample illustrates how to extract tables from PDF documents.
/// </summary>
/// <remarks>
/// Produces four outputs: JSON written to a file, JSON returned as a string, XLSX written to
/// a file, and XLSX written through an in-memory filter. When the tabular engine is not
/// available a notice is printed and nothing is extracted. A <see cref="PDFNetException"/>
/// raised during extraction is caught and its message is printed.
/// </remarks>
static void TestTabularData()
{
    // Test if the add-on is installed
    if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular))
    {
        Console.WriteLine();
        Console.WriteLine("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.");
        Console.WriteLine("---------------------------------------------------------------");
        Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
        Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
        Console.WriteLine("module, ensure that the SDK is able to find the required files");
        Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
        Console.WriteLine();
        return;
    }

    try
    {
        // Extract tabular data as a JSON file
        DataExtractionModule.ExtractData(input_path + "table.pdf", output_path + "table.json", DataExtractionModule.DataExtractionEngine.e_tabular);

        // Extract tabular data as a JSON string
        string json = DataExtractionModule.ExtractData(input_path + "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular);
        System.IO.File.WriteAllText(output_path + "financial.json", json);

        // Extract tabular data as an XLSX file
        DataExtractionModule.ExtractToXLSX(input_path + "table.pdf", output_path + "table.xlsx");

        // Extract tabular data as an XLSX stream (also known as filter).
        // The filter is scoped with 'using' so it is released even if extraction
        // or the file write throws, the same way PDFDoc is handled in this sample.
        using (MemoryFilter output_xlsx_stream = new MemoryFilter(0, false))
        {
            DataExtractionModule.ExtractToXLSX(input_path + "financial.pdf", output_xlsx_stream);
            // NOTE(review): presumably switches the filter to read mode so the buffered
            // bytes can be written out — confirm against the MemoryFilter documentation.
            output_xlsx_stream.SetAsInputFilter();
            output_xlsx_stream.WriteToFile(output_path + "financial.xlsx", false);
        }
    }
    catch (PDFNetException e)
    {
        // Report the failure instead of propagating it.
        Console.WriteLine(e.Message);
    }
}
76
77
/// <summary>
/// The following sample illustrates how to extract document structure from PDF documents.
/// </summary>
/// <remarks>
/// Writes the structure of one document as a JSON file and of another as a JSON string that
/// is then saved to disk. When the document-structure engine is not available a notice is
/// printed and nothing is extracted. A <see cref="PDFNetException"/> raised during extraction
/// is caught and its message is printed.
/// </remarks>
static void TestDocumentStructure()
{
    // Test if the add-on is installed
    if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure))
    {
        Console.WriteLine();
        Console.WriteLine("Unable to run Data Extraction: Apryse SDK Structured Output module not available.");
        Console.WriteLine("---------------------------------------------------------------");
        Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
        Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
        Console.WriteLine("module, ensure that the SDK is able to find the required files");
        Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
        Console.WriteLine();
        return;
    }

    try
    {
        // Extract document structure as a JSON file
        DataExtractionModule.ExtractData(input_path + "paragraphs_and_tables.pdf", output_path + "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure);

        // Extract document structure as a JSON string
        string json = DataExtractionModule.ExtractData(input_path + "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure);
        System.IO.File.WriteAllText(output_path + "tagged.json", json);
    }
    catch (PDFNetException e)
    {
        // Report the failure instead of propagating it.
        Console.WriteLine(e.Message);
    }
}
111
112
/// <summary>
/// The following sample illustrates how to extract form fields from PDF documents.
/// </summary>
/// <remarks>
/// Extracts form fields as JSON (to a file and as a string), then runs form-field detection
/// twice on the same input PDF: once with default options and once with the overlapping-field
/// behavior set to "KeepOld". When the form engine is not available a notice is printed and
/// nothing is extracted. A <see cref="PDFNetException"/> is caught and its message is printed.
/// </remarks>
static void TestFormFields()
{
    // Test if the add-on is installed
    if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
    {
        Console.WriteLine();
        Console.WriteLine("Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.");
        Console.WriteLine("---------------------------------------------------------------");
        Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
        Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
        Console.WriteLine("module, ensure that the SDK is able to find the required files");
        Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
        Console.WriteLine();
        return;
    }

    try
    {
        // Extract form fields as a JSON file
        DataExtractionModule.ExtractData(input_path + "formfields-scanned.pdf", output_path + "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form);

        // Extract form fields as a JSON string
        string json = DataExtractionModule.ExtractData(input_path + "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form);
        System.IO.File.WriteAllText(output_path + "formfields.json", json);

        // Detect and add form fields to a PDF document.
        // PDF document already has form fields, and this sample will update to new found fields.
        using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
        {
            DataExtractionModule.DetectAndAddFormFieldsToPDF(doc);
            doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDFDoc.SaveOptions.e_linearized);
        }

        // Detect and add form fields to a PDF document.
        // PDF document already has form fields, and this sample will keep the original fields.
        using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
        {
            // Setup DataExtractionOptions to keep old fields
            DataExtractionOptions options = new DataExtractionOptions();
            options.SetOverlappingFormFieldBehavior("KeepOld");

            DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options);
            doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDFDoc.SaveOptions.e_linearized);
        }
    }
    catch (PDFNetException e)
    {
        // Report the failure instead of propagating it.
        Console.WriteLine(e.Message);
    }
}
165
/// <summary>
/// The following sample illustrates how to extract key-value pairs from PDF documents.
/// </summary>
/// <remarks>
/// Runs the generic key-value engine twice on the same newsletter: once with default options,
/// and once restricted to pages 2-4 with per-page inclusion and exclusion zones. When the
/// engine is not available a notice is printed and nothing is extracted. A
/// <see cref="PDFNetException"/> is caught and its message is printed.
/// </remarks>
static void TestGenericKeyValue()
{
    // Test if the add-on is installed
    if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_generic_key_value))
    {
        Console.WriteLine();
        Console.WriteLine("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.");
        Console.WriteLine("---------------------------------------------------------------");
        Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
        Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
        Console.WriteLine("module, ensure that the SDK is able to find the required files");
        Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
        Console.WriteLine();
        return;
    }

    try
    {
        // Simple example: Extract Keys & Values as a JSON file
        DataExtractionModule.ExtractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value);

        // Example with customized options:
        // Extract Keys & Values from pages 2-4, excluding ads
        DataExtractionOptions options = new DataExtractionOptions();
        options.SetPages("2-4");

        RectCollection p2ExclusionZones = new RectCollection();
        // Exclude the ad on page 2
        // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
        // Coordinates rotate with the page, if it has rotation applied.
        p2ExclusionZones.AddRect(166, 47, 562, 222);
        options.AddExclusionZonesForPage(p2ExclusionZones, 2);

        RectCollection p4InclusionZones = new RectCollection();
        RectCollection p4ExclusionZones = new RectCollection();
        // Only include the article text for page 4, exclude ads and headings
        p4InclusionZones.AddRect(30, 432, 562, 684);
        p4ExclusionZones.AddRect(30, 657, 295, 684);
        options.AddInclusionZonesForPage(p4InclusionZones, 4);
        options.AddExclusionZonesForPage(p4ExclusionZones, 4);

        DataExtractionModule.ExtractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options);
    }
    catch (PDFNetException e)
    {
        // Report the failure instead of propagating it.
        Console.WriteLine(e.Message);
    }
}
216
217
218
/// <summary>
/// The main entry point for the application.
/// </summary>
/// <param name="args">Command-line arguments; not used by this sample.</param>
static void Main(string[] args)
{
    // The first step in every application using PDFNet is to initialize the
    // library and set the path to common PDF resources. The library is usually
    // initialized only once, but calling Initialize() multiple times is also fine.
    PDFNet.Initialize(PDFTronLicense.Key);

    try
    {
        PDFNet.AddResourceSearchPath("../../../../../Lib/");

        TestTabularData();
        TestDocumentStructure();
        TestFormFields();
        TestGenericKeyValue();
    }
    finally
    {
        // The samples only catch PDFNetException; anything else (for example an
        // I/O error while writing a JSON file) would otherwise skip Terminate().
        PDFNet.Terminate();
    }
}
237 }
238}
239
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/DataExtractionModule.h>
7#include <PDF/PDFNet.h>
8#include <PDF/PDFDoc.h>
9#include <PDF/Convert.h>
10#include <Filters/MemoryFilter.h>
11#include <string>
12#include <iostream>
13#include <fstream>
14#include "../../LicenseKey/CPP/LicenseKey.h"
15
16using namespace pdftron;
17using namespace PDF;
18using namespace Filters;
19using namespace std;
20
21//---------------------------------------------------------------------------------------
22// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
23// extract various types of data from PDF documents.
24//
25// The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
26//---------------------------------------------------------------------------------------
27
28void WriteTextToFile(const std::string& filename, const UString& text)
29{
30 ofstream out_file(filename.c_str(), ofstream::binary);
31 string out_buf = text.ConvertToUtf8();
32 out_file.write(out_buf.c_str(), out_buf.size());
33 out_file.close();
34}
35
36
37string input_path("../../TestFiles/");
38string output_path("../../TestFiles/Output/");
39
40//---------------------------------------------------------------------------------------
41// The following sample illustrates how to extract tables from PDF documents.
42//---------------------------------------------------------------------------------------
43void TestTabularData()
44{
45 // Test if the add-on is installed
46 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular))
47 {
48 cout << endl;
49 cout << "Unable to run Data Extraction: Apryse SDK Tabular Data module not available." << endl;
50 cout << "---------------------------------------------------------------" << endl;
51 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
52 cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
53 cout << "module, ensure that the SDK is able to find the required files" << endl;
54 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
55 return;
56 }
57
58 // Extract tabular data as a JSON file
59 DataExtractionModule::ExtractData(input_path + UString("table.pdf"), output_path + UString("table.json"), DataExtractionModule::e_Tabular);
60
61 // Extract tabular data as a JSON string
62 UString json = DataExtractionModule::ExtractData(input_path + UString("financial.pdf"), DataExtractionModule::e_Tabular);
63 WriteTextToFile((output_path + "financial.json").c_str(), json);
64
65 // Extract tabular data as an XLSX file
66 DataExtractionModule::ExtractToXLSX(input_path + UString("table.pdf"), output_path + UString("table.xlsx"));
67
68 // Extract tabular data as an XLSX stream (also known as filter)
69 MemoryFilter output_xlsx_stream(0, false);
70 DataExtractionOptions options;
71 options.SetPages("1"); // extract page 1
72 DataExtractionModule::ExtractToXLSX(input_path + UString("financial.pdf"), output_xlsx_stream, &options);
73 output_xlsx_stream.SetAsInputFilter();
74 output_xlsx_stream.WriteToFile(output_path + UString("financial.xlsx"), false);
75}
76
77//---------------------------------------------------------------------------------------
78// The following sample illustrates how to extract document structure from PDF documents.
79//---------------------------------------------------------------------------------------
80void TestDocumentStructure()
81{
82 // Test if the add-on is installed
83 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure))
84 {
85 cout << endl;
86 cout << "Unable to run Data Extraction: Apryse SDK Structured Output module not available." << endl;
87 cout << "---------------------------------------------------------------" << endl;
88 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
89 cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
90 cout << "module, ensure that the SDK is able to find the required files" << endl;
91 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
92 return;
93 }
94
95 // Extract document structure as a JSON file
96 DataExtractionModule::ExtractData(input_path + UString("paragraphs_and_tables.pdf"), output_path + UString("paragraphs_and_tables.json"), DataExtractionModule::e_DocStructure);
97
98 // Extract document structure as a JSON string
99 UString json = DataExtractionModule::ExtractData(input_path + UString("tagged.pdf"), DataExtractionModule::e_DocStructure);
100 WriteTextToFile((output_path + "tagged.json").c_str(), json);
101}
102
103//---------------------------------------------------------------------------------------
104// The following sample illustrates how to extract form fields from PDF documents.
105//---------------------------------------------------------------------------------------
106void TestFormFields()
107{
108 // Test if the add-on is installed
109 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form))
110 {
111 cout << endl;
112 cout << "Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available." << endl;
113 cout << "---------------------------------------------------------------" << endl;
114 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
115 cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
116 cout << "module, ensure that the SDK is able to find the required files" << endl;
117 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
118 return;
119 }
120
121 // Extract form fields as a JSON file
122 DataExtractionModule::ExtractData(input_path + UString("formfields-scanned.pdf"), output_path + UString("formfields-scanned.json"), DataExtractionModule::e_Form);
123
124 // Extract form fields as a JSON string
125 UString json = DataExtractionModule::ExtractData(input_path + UString("formfields.pdf"), DataExtractionModule::e_Form);
126 WriteTextToFile((output_path + "formfields.json").c_str(), json);
127
128 //---------------------------------------------------------------------------------------
129 // Detect and add form fields to a PDF document.
130 // PDF document already has form fields, and this sample will update to new found fields.
131 //---------------------------------------------------------------------------------------
132 {
133 PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
134
135 DataExtractionModule::DetectAndAddFormFieldsToPDF(doc);
136
137 // Save the modfied pdf document
138 doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDF::SDFDoc::e_linearized, NULL);
139 }
140
141 //---------------------------------------------------------------------------------------
142 // Detect and add form fields to a PDF document.
143 // PDF document already has form fields, and this sample will keep the original fields.
144 //---------------------------------------------------------------------------------------
145 {
146 PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
147
148 // Setup DataExtractionOptions to keep old fields
149 DataExtractionOptions options;
150 options.SetOverlappingFormFieldBehavior("KeepOld");
151
152 DataExtractionModule::DetectAndAddFormFieldsToPDF(doc, &options);
153
154 // Save the modfied pdf document
155 doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDF::SDFDoc::e_linearized, NULL);
156 }
157}
158
159//---------------------------------------------------------------------------------------
160// The following sample illustrates how to extract key-value pairs from PDF documents.
161//---------------------------------------------------------------------------------------
162void TestGenericKeyValue() {
163
164 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_GenericKeyValue))
165 {
166 cout << endl;
167 cout << "Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available." << endl;
168 cout << "---------------------------------------------------------------" << endl;
169 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
170 cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
171 cout << "module, ensure that the SDK is able to find the required files" << endl;
172 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
173 return;
174 }
175
176 // Simple example: Extract Keys & Values as a JSON file
177 DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val.json"), DataExtractionModule::e_GenericKeyValue);
178
179 // Example with customized options:
180 // Extract Keys & Values from pages 2-4, excluding ads
181 DataExtractionOptions options;
182 options.SetPages("2-4");
183 RectCollection p2_exclusion_zones;
184 // Exclude the ad on page 2
185 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
186 // Coordinates rotate with the page, if it has rotation applied.
187 p2_exclusion_zones.AddRect(166, 47, 562, 222);
188 options.AddExclusionZonesForPage(p2_exclusion_zones, 2);
189
190 RectCollection p4_inclusion_zones, p4_exclusion_zones;
191 // Only include the article text for page 4, exclude ads and headings
192 p4_inclusion_zones.AddRect(30, 432, 562, 684);
193 p4_exclusion_zones.AddRect(30, 657, 295, 684);
194 options.AddInclusionZonesForPage(p4_inclusion_zones, 4);
195 options.AddExclusionZonesForPage(p4_exclusion_zones, 4);
196
197 DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val_with_zones.json"), DataExtractionModule::e_GenericKeyValue, &options);
198}
199
200int main(int argc, char* argv[])
201{
202 // The first step in every application using PDFNet is to initialize the
203 // library and set the path to common PDF resources. The library is usually
204 // initialized only once, but calling Initialize() multiple times is also fine.
205 PDFNet::Initialize(LicenseKey);
206
207 int ret = 0;
208
209 try
210 {
211 PDFNet::AddResourceSearchPath("../../../Lib/");
212
213 TestTabularData();
214 TestDocumentStructure();
215 TestFormFields();
216 TestGenericKeyValue();
217 }
218 catch (Common::Exception& e)
219 {
220 cout << e << endl;
221 ret = 1;
222 }
223 catch (...)
224 {
225 cout << "Unknown Exception" << endl;
226 ret = 1;
227 }
228
229 PDFNet::Terminate();
230
231 return ret;
232}
233
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8 "fmt"
9 "testing"
10 "os"
11 "flag"
12 . "github.com/pdftron/pdftron-go/v2"
13)
14
15var licenseKey string
16var modulePath string
17
18func init() {
19 flag.StringVar(&licenseKey, "license", "", "License key for Apryse SDK")
20 flag.StringVar(&modulePath, "modulePath", "", "Path for downloaded modules")
21}
22
23//---------------------------------------------------------------------------------------
24// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
25// extract various types of data from PDF documents.
26//
27// The PDFTron SDK Data Extraction suite can be downloaded from
28// https://docs.apryse.com/documentation/core/info/modules/
29//
30// Please contact us if you have any questions.
31//---------------------------------------------------------------------------------------
32
33// Relative path to the folder containing the test files.
34var inputPath = "../TestFiles/"
35var outputPath = "../TestFiles/Output/"
36
37//---------------------------------------------------------------------------------------
38
// catch is meant to be deferred. If the surrounding function is panicking, it
// recovers and stores the panic value, formatted with %v, in *err. When there is
// no panic, *err is left untouched.
func catch(err *error) {
	recovered := recover()
	if recovered == nil {
		return
	}
	*err = fmt.Errorf("%v", recovered)
}
44
45//---------------------------------------------------------------------------------------
46
47func WriteTextToFile(outputFile string, text string) {
48 f, err := os.Create(outputFile)
49 if err != nil {
50 fmt.Println(err)
51 }
52
53 defer f.Close()
54
55 _, err2 := f.WriteString(text)
56 if err2 != nil {
57 fmt.Println(err2)
58 }
59}
60
61//---------------------------------------------------------------------------------------
62// The following sample illustrates how to extract tables from PDF documents.
63//---------------------------------------------------------------------------------------
64
65func TabularDataTest() (err error) {
66 defer catch(&err)
67
68 PDFNetAddResourceSearchPath(modulePath)
69
70 // Test if the add-on is installed
71 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_Tabular) {
72 fmt.Println("")
73 fmt.Println("Unable to run Data Extraction: PDFTron SDK Tabular Data module not available.")
74 fmt.Println("-----------------------------------------------------------------------------")
75 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
76 fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
77 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
78 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
79 fmt.Println("")
80 return nil
81 }
82
83 // Extract tabular data as a JSON file
84 fmt.Println("Extract tabular data as a JSON file")
85
86 inputFile := inputPath + "table.pdf"
87 outputFile := outputPath + "table.json"
88 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_Tabular)
89
90 fmt.Println("Result saved in " + outputFile)
91
92 // Extract tabular data as a JSON string
93 fmt.Println("Extract tabular data as a JSON string")
94
95 inputFile = inputPath + "financial.pdf"
96 outputFile = outputPath + "financial.json"
97
98 json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_Tabular).(string)
99 WriteTextToFile(outputFile, json)
100
101 fmt.Println("Result saved in " + outputFile)
102
103 // Extract tabular data as an XLSX file
104 fmt.Println("Extract tabular data as an XLSX file")
105
106 inputFile = inputPath + "table.pdf"
107 outputFile = outputPath + "table.xlsx"
108 DataExtractionModuleExtractToXLSX(inputFile, outputFile)
109
110 fmt.Println("Result saved in " + outputFile)
111
112 // Extract tabular data as an XLSX stream (also known as filter)
113 fmt.Println("Extract tabular data as an XLSX stream")
114
115 inputFile = inputPath + "financial.pdf"
116 outputFile = outputPath + "financial.xlsx"
117 outputXlsxStream := NewMemoryFilter(0, false)
118 outputFilter := NewFilter(outputXlsxStream)
119 options := NewDataExtractionOptions()
120 options.SetPages("1"); // page 1
121 DataExtractionModuleExtractToXLSX(inputFile, outputFilter, options)
122 outputXlsxStream.SetAsInputFilter()
123 outputXlsxStream.WriteToFile(outputFile, false)
124
125 fmt.Println("Result saved in " + outputFile)
126
127 return nil
128}
129
130//---------------------------------------------------------------------------------------
131// The following sample illustrates how to extract document structure from PDF documents.
132//---------------------------------------------------------------------------------------
133
134func DocumentStructureTest() (err error) {
135 defer catch(&err)
136
137 // Test if the add-on is installed
138 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_DocStructure) {
139 fmt.Println("")
140 fmt.Println("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
141 fmt.Println("-----------------------------------------------------------------------------")
142 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
143 fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
144 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
145 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
146 fmt.Println("")
147 return nil
148 }
149
150 // Extract document structure as a JSON file
151 fmt.Println("Extract document structure as a JSON file")
152
153 inputFile := inputPath + "paragraphs_and_tables.pdf"
154 outputFile := outputPath + "paragraphs_and_tables.json"
155 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocStructure)
156
157 fmt.Println("Result saved in " + outputFile)
158
159 // Extract document structure as a JSON string
160 fmt.Println("Extract document structure as a JSON string")
161
162 inputFile = inputPath + "tagged.pdf"
163 outputFile = outputPath + "tagged.json"
164 json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_DocStructure).(string)
165 WriteTextToFile(outputFile, json)
166
167 fmt.Println("Result saved in " + outputFile)
168
169 return nil
170}
171
172//---------------------------------------------------------------------------------------
173// The following sample illustrates how to extract form fields from PDF documents.
174//---------------------------------------------------------------------------------------
175
176func FormFieldsTest() (err error) {
177 defer catch(&err)
178
179 // Test if the add-on is installed
180 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_Form) {
181 fmt.Println("")
182 fmt.Println("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.")
183 fmt.Println("-----------------------------------------------------------------------------")
184 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
185 fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
186 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
187 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
188 fmt.Println("")
189 return nil
190 }
191
192 // Extract form fields as a JSON file
193 fmt.Println("Extract form fields as a JSON file")
194
195 inputFile := inputPath + "formfields-scanned.pdf"
196 outputFile := outputPath + "formfields-scanned.json"
197 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_Form)
198
199 fmt.Println("Result saved in " + outputFile)
200
201 // Extract form fields as a JSON string
202 fmt.Println("Extract form fields as a JSON string")
203
204 inputFile = inputPath + "formfields.pdf"
205 outputFile = outputPath + "formfields.json"
206
207 json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_Form).(string)
208 WriteTextToFile(outputFile, json)
209
210 fmt.Println("Result saved in " + outputFile)
211
212 //////////////////////////////////////////////////////////////////////////
213 // Detect and add form fields to a PDF document.
214 // PDF document already has form fields, and this sample will update to new found fields.
215 doc := NewPDFDoc(inputPath + "formfields-scanned-withfields.pdf")
216
217 fmt.Println("Extract form fields as a PDF file, keep new fields")
218 DataExtractionModuleDetectAndAddFormFieldsToPDF(doc)
219
220 outputFile = outputPath + "formfields-scanned-fields-new.pdf"
221 doc.Save(outputFile, uint(SDFDocE_linearized))
222 doc.Close()
223
224 fmt.Println("Result saved in " + outputFile)
225
226 //////////////////////////////////////////////////////////////////////////
227 // Detect and add form fields to a PDF document.
228 // PDF document already has form fields, and this sample will keep the original fields.
229 doc = NewPDFDoc(inputPath + "formfields-scanned-withfields.pdf")
230
231 // Setup DataExtractionOptions to keep old fields
232 options := NewDataExtractionOptions()
233 options.SetOverlappingFormFieldBehavior("KeepOld")
234
235 fmt.Println("Extract form fields as a PDF file, keep old fields")
236 DataExtractionModuleDetectAndAddFormFieldsToPDF(doc, options)
237
238 outputFile = outputPath + "formfields-scanned-fields-old.pdf"
239 doc.Save(outputFile, uint(SDFDocE_linearized))
240 doc.Close()
241
242 fmt.Println("Result saved in " + outputFile)
243
244 return nil
245}
246
247//---------------------------------------------------------------------------------------
248// The following sample illustrates how to extract key-value pairs from PDF documents.
249//---------------------------------------------------------------------------------------
250
251func GenericKeyValueTest() (err error) {
252 defer catch(&err)
253
254 // Test if the add-on is installed
255 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_GenericKeyValue) {
256 fmt.Println("")
257 fmt.Println("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.")
258 fmt.Println("-----------------------------------------------------------------------------")
259 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
260 fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
261 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
262 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
263 fmt.Println("")
264 return nil
265 }
266
267 fmt.Println("Extract key-value pairs from a PDF")
268
269 inputFile := inputPath + "newsletter.pdf"
270 outputFile := outputPath + "newsletter_key_val.json"
271 // Simple example: Extract Keys & Values as a JSON file
272 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_GenericKeyValue)
273
274 fmt.Println("Result saved in " + outputFile)
275
276 // Example with customized options:
277 // Extract Keys & Values from pages 2-4, excluding ads
278 options := NewDataExtractionOptions()
279 options.SetPages("2-4")
280
281 p2ExclusionZones := NewRectCollection()
282 // Exclude the ad on page 2
283 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
284 // Coordinates rotate with the page, if it has rotation applied.
285 p2ExclusionZones.AddRect(NewRect(166, 47, 562, 222))
286 options.AddExclusionZonesForPage(p2ExclusionZones, 2)
287
288 p4InclusionZones := NewRectCollection()
289 p4ExclusionZones := NewRectCollection()
290 // Only include the article text for page 4, exclude ads and headings
291 p4InclusionZones.AddRect(NewRect(30, 432, 562, 684))
292 p4ExclusionZones.AddRect(NewRect(30, 657, 295, 684))
293 options.AddInclusionZonesForPage(p4InclusionZones, 4)
294 options.AddExclusionZonesForPage(p4ExclusionZones, 4)
295
296 fmt.Println("Extract Key-Value pairs from specific pages and zones as a JSON file")
297 outputFile = outputPath + "newsletter_key_val_with_zones.json"
298 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_GenericKeyValue, options)
299
300 fmt.Println("Result saved in " + outputFile)
301
302 return nil
303}
304
305//---------------------------------------------------------------------------------------
306
307func TestDataExtraction(t *testing.T) {
308 // The first step in every application using PDFNet is to initialize the
309 // library. The library is usually initialized only once, but calling
310 // Initialize() multiple times is also fine.
311 PDFNetInitialize(licenseKey)
312
313 //-----------------------------------------------------------------------------------
314
315 PDFNetAddResourceSearchPath("../../../PDFNetC/Lib/")
316
317 //-----------------------------------------------------------------------------------
318
319 err := TabularDataTest()
320 if err != nil {
321 fmt.Println(fmt.Errorf("Unable to extract tabular data, error: %s", err))
322 }
323
324 //-----------------------------------------------------------------------------------
325
326 err = DocumentStructureTest()
327 if err != nil {
328 fmt.Println(fmt.Errorf("Unable to extract document structure data, error: %s", err))
329 }
330
331 //-----------------------------------------------------------------------------------
332
333 err = FormFieldsTest()
334 if err != nil {
335 fmt.Println(fmt.Errorf("Unable to extract form fields data, error: %s", err))
336 }
337
338 err = GenericKeyValueTest()
339 if err != nil {
340 fmt.Println(fmt.Errorf("Unable to extract key-value pairs, error: %s", err))
341 }
342
343 //-----------------------------------------------------------------------------------
344
345 PDFNetTerminate()
346 fmt.Println("Done.")
347}
348
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import java.io.FileWriter;
7import java.io.BufferedWriter;
8import java.io.FileNotFoundException;
9import java.io.IOException;
10
11import com.pdftron.common.PDFNetException;
12import com.pdftron.pdf.*;
13import com.pdftron.filters.*;
14import com.pdftron.sdf.SDFDoc;
15
16//---------------------------------------------------------------------------------------
17// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
18// extract various types of data from PDF documents.
19//
20// The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
21//---------------------------------------------------------------------------------------
22
23public class DataExtractionTest {
24
25 static void writeTextToFile(String filename, String text) throws IOException
26 {
27 BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
28 writer.write(text);
29 writer.close();
30 }
31
32 //---------------------------------------------------------------------------------------
33 // The following sample illustrates how to extract tables from PDF documents.
34 //---------------------------------------------------------------------------------------
35 static void testTabularData()
36 {
37 try {
38 // Test if the add-on is installed
39 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular))
40 {
41 System.out.println();
42 System.out.println("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.");
43 System.out.println("---------------------------------------------------------------");
44 System.out.println("The Data Extraction suite is an optional add-on, available for download");
45 System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
46 System.out.println("module, ensure that the SDK is able to find the required files");
47 System.out.println("using the PDFNet.addResourceSearchPath() function." );
48 System.out.println();
49 return;
50 }
51 } catch (PDFNetException e) {
52 System.out.println("Data Extraction module not available, error:");
53 e.printStackTrace();
54 System.out.println(e);
55 }
56
57 // Relative path to the folder containing test files.
58 String input_path = "../../TestFiles/";
59 String output_path = "../../TestFiles/Output/";
60
61 try {
62 // Extract tabular data as a JSON file
63 DataExtractionModule.extractData(input_path + "table.pdf", output_path + "table.json", DataExtractionModule.DataExtractionEngine.e_tabular);
64
65 // Extract tabular data as a JSON string
66 String json = DataExtractionModule.extractData(input_path + "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular);
67 writeTextToFile(output_path + "financial.json", json);
68
69 // Extract tabular data as an XLSX file
70 DataExtractionModule.extractToXLSX(input_path + "table.pdf", output_path + "table.xlsx");
71
72 // Extract tabular data as an XLSX stream (also known as filter)
73 DataExtractionOptions options = new DataExtractionOptions();
74 options.setPages("1");
75 MemoryFilter output_xlsx_stream = new MemoryFilter(0, false);
76 DataExtractionModule.extractToXLSX(input_path + "financial.pdf", output_xlsx_stream, options);
77 output_xlsx_stream.setAsInputFilter();
78 output_xlsx_stream.writeToFile(output_path + "financial.xlsx", false);
79
80 } catch (PDFNetException e) {
81 System.out.println(e);
82 }
83 catch (IOException e) {
84 System.out.println(e);
85 }
86 }
87
88 //---------------------------------------------------------------------------------------
89 // The following sample illustrates how to extract document structure from PDF documents.
90 //---------------------------------------------------------------------------------------
91 static void testDocumentStructure()
92 {
93 // Test if the add-on is installed
94 try {
95 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure))
96 {
97 System.out.println();
98 System.out.println("Unable to run Data Extraction: Apryse SDK Structured Output module not available.");
99 System.out.println("---------------------------------------------------------------");
100 System.out.println("The Data Extraction suite is an optional add-on, available for download");
101 System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
102 System.out.println("module, ensure that the SDK is able to find the required files");
103 System.out.println("using the PDFNet.addResourceSearchPath() function." );
104 System.out.println();
105 return;
106 }
107 } catch (PDFNetException e) {
108 System.out.println("Data Extraction module not available, error:");
109 e.printStackTrace();
110 System.out.println(e);
111 }
112
113 // Relative path to the folder containing test files.
114 String input_path = "../../TestFiles/";
115 String output_path = "../../TestFiles/Output/";
116
117 try {
118 // Extract document structure as a JSON file
119 DataExtractionModule.extractData(input_path + "paragraphs_and_tables.pdf", output_path + "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure);
120
121 // Extract document structure as a JSON string
122 String json = DataExtractionModule.extractData(input_path + "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure);
123 writeTextToFile(output_path + "tagged.json", json);
124
125 } catch (PDFNetException e) {
126 System.out.println(e);
127 }
128 catch (IOException e) {
129 System.out.println(e);
130 }
131 }
132
133 //---------------------------------------------------------------------------------------
134 // The following sample illustrates how to extract form fields from PDF documents.
135 //---------------------------------------------------------------------------------------
136 static void testFormFields()
137 {
138 try {
139 // Test if the add-on is installed
140 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
141 {
142 System.out.println();
143 System.out.println("Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.");
144 System.out.println("---------------------------------------------------------------");
145 System.out.println("The Data Extraction suite is an optional add-on, available for download");
146 System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
147 System.out.println("module, ensure that the SDK is able to find the required files");
148 System.out.println("using the PDFNet.addResourceSearchPath() function." );
149 System.out.println();
150 return;
151 }
152 } catch (PDFNetException e) {
153 System.out.println("Data Extraction module not available, error:");
154 e.printStackTrace();
155 System.out.println(e);
156 }
157
158 // Relative path to the folder containing test files.
159 String input_path = "../../TestFiles/";
160 String output_path = "../../TestFiles/Output/";
161
162 try {
163 // Extract form fields as a JSON file
164 DataExtractionModule.extractData(input_path + "formfields-scanned.pdf", output_path + "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form);
165
166 // Extract form fields as a JSON string
167 String json = DataExtractionModule.extractData(input_path + "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form);
168 writeTextToFile(output_path + "formfields.json", json);
169
170 //---------------------------------------------------------------------------------------
171 // Detect and add form fields to a PDF document.
172 // PDF document already has form fields, and this sample will update to new found fields.
173 //---------------------------------------------------------------------------------------
174 try (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
175 {
176 DataExtractionModule.detectAndAddFormFieldsToPDF(doc);
177
178 // Save the modfied pdf document
179 doc.save(output_path + "formfields-scanned-fields-new.pdf", SDFDoc.SaveMode.LINEARIZED, null);
180 } catch (Exception e) {
181 e.printStackTrace();
182 }
183
184 //---------------------------------------------------------------------------------------
185 // Detect and add form fields to a PDF document.
186 // PDF document already has form fields, and this sample will keep the original fields.
187 //---------------------------------------------------------------------------------------
188 try (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
189 {
190 // Setup DataExtractionOptions to keep old fields
191 DataExtractionOptions options = new DataExtractionOptions();
192 options.setOverlappingFormFieldBehavior("KeepOld");
193
194 DataExtractionModule.detectAndAddFormFieldsToPDF(doc, options);
195
196 // Save the modfied pdf document
197 doc.save(output_path + "formfields-scanned-fields-old.pdf", SDFDoc.SaveMode.LINEARIZED, null);
198 } catch (Exception e) {
199 e.printStackTrace();
200 }
201
202 } catch (PDFNetException e) {
203 System.out.println(e);
204 }
205 catch (IOException e) {
206 System.out.println(e);
207 }
208 }
209
210 //---------------------------------------------------------------------------------------
211 // The following sample illustrates how to extract key-value pairs from PDF documents.
212 //---------------------------------------------------------------------------------------
213 public static void testGenericKeyValue() {
214 try {
215 // Test if the add-on is installed
216 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
217 {
218 System.out.println();
219 System.out.println("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.");
220 System.out.println("---------------------------------------------------------------");
221 System.out.println("The Data Extraction suite is an optional add-on, available for download");
222 System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
223 System.out.println("module, ensure that the SDK is able to find the required files");
224 System.out.println("using the PDFNet.addResourceSearchPath() function." );
225 System.out.println();
226 return;
227 }
228 } catch (PDFNetException e) {
229 System.out.println("Data Extraction module not available, error:");
230 e.printStackTrace();
231 System.out.println(e);
232 }
233
234 // Relative path to the folder containing test files.
235 String input_path = "../../TestFiles/";
236 String output_path = "../../TestFiles/Output/";
237
238 try {
239
240 // Simple example: Extract Keys & Values as a JSON file
241 DataExtractionModule.extractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value);
242
243 // Example with customized options:
244 // Extract Keys & Values from pages 2-4, excluding ads
245 DataExtractionOptions options = new DataExtractionOptions();
246 options.setPages("2-4");
247
248 RectCollection p2ExclusionZones = new RectCollection();
249 // Exclude the ad on page 2
250 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
251 // Coordinates rotate with the page, if it has rotation applied.
252 p2ExclusionZones.addRect(166, 47, 562, 222);
253 options.addExclusionZonesForPage(p2ExclusionZones, 2);
254
255 RectCollection p4InclusionZones = new RectCollection();
256 RectCollection p4ExclusionZones = new RectCollection();
257 // Only include the article text for page 4, exclude ads and headings
258 p4InclusionZones.addRect(30, 432, 562, 684);
259 p4ExclusionZones.addRect(30, 657, 295, 684);
260 options.addInclusionZonesForPage(p4InclusionZones, 4);
261 options.addExclusionZonesForPage(p4ExclusionZones, 4);
262
263 DataExtractionModule.extractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options);
264
265 } catch (Exception e) {
266 System.out.println(e);
267 }
268 }
269
270 public static void main(String[] args)
271 {
272 // The first step in every application using PDFNet is to initialize the
273 // library and set the path to common PDF resources. The library is usually
274 // initialized only once, but calling initialize() multiple times is also fine.
275 PDFNet.initialize(PDFTronLicense.Key());
276 PDFNet.addResourceSearchPath("../../../Lib/");
277
278 testTabularData();
279 testDocumentStructure();
280 testFormFields();
281 testGenericKeyValue();
282
283 PDFNet.terminate();
284 }
285}
286
1 <?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
10//---------------------------------------------------------------------------------------
11// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
12// extract various types of data from PDF documents.
13//
14// The PDFTron SDK Data Extraction suite can be downloaded from
15// https://docs.apryse.com/documentation/core/info/modules/
16//
17// Please contact us if you have any questions.
18//---------------------------------------------------------------------------------------
19
/**
 * Writes $text to the file at $outputFile, truncating any existing content.
 *
 * Failures are raised as Exception so that callers using catch(Exception)
 * can report them, instead of passing a failed handle on to fwrite().
 *
 * @param string $outputFile Path of the file to write.
 * @param string $text       Content to write.
 * @throws Exception If the file cannot be opened or written.
 */
function WriteTextToFile($outputFile, $text)
{
	$outfile = fopen($outputFile, "w");
	if ($outfile === false) {
		throw new Exception("Unable to open " . $outputFile . " for writing");
	}

	try {
		if (fwrite($outfile, $text) === false) {
			throw new Exception("Unable to write to " . $outputFile);
		}
	} finally {
		// Always release the handle, even when the write failed.
		fclose($outfile);
	}
}
26
/**
 * Prints the standard notice shown when an optional Data Extraction module
 * is not installed.
 *
 * @param string $moduleName Display name of the missing module, e.g. "Tabular Data".
 */
function PrintModuleUnavailable($moduleName)
{
	echo(nl2br("\n"));
	echo(nl2br("Unable to run Data Extraction: PDFTron SDK " . $moduleName . " module not available.\n"));
	echo(nl2br("-----------------------------------------------------------------------------\n"));
	echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
	echo(nl2br("at https://docs.apryse.com/documentation/core/info/modules/. If you have already\n"));
	echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
	echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
	echo(nl2br("\n"));
}

/**
 * Runs the four Data Extraction samples in turn: tabular data, document
 * structure, form fields and generic key-value pairs. Each sample is skipped
 * with a notice when its module is not available, and a failure in one
 * sample is reported without stopping the following ones.
 */
function main()
{
	// Relative path to the folder containing the test files.
	$inputPath = getcwd()."/../../TestFiles/";
	$outputPath = $inputPath."Output/";

	// The first step in every application using PDFNet is to initialize the
	// library. The library is usually initialized only once, but calling
	// Initialize() multiple times is also fine.
	global $LicenseKey;
	PDFNet::Initialize($LicenseKey);
	PDFNet::GetSystemFontList();    // Wait for fonts to be loaded if they haven't already. This is done because PHP can run into errors when shutting down if font loading is still in progress.

	//-----------------------------------------------------------------------------------

	PDFNet::AddResourceSearchPath("../../../PDFNetC/Lib/");

	//////////////////////////////////////////////////////////////////////////
	// The following sample illustrates how to extract tables from PDF documents.
	//////////////////////////////////////////////////////////////////////////

	// Test if the add-on is installed
	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular)) {
		PrintModuleUnavailable("Tabular Data");
	}
	else {
		try {
			// Extract tabular data as a JSON file
			echo(nl2br("Extract tabular data as a JSON file\n"));

			$outputFile = $outputPath."table.json";
			DataExtractionModule::ExtractData($inputPath."table.pdf", $outputFile, DataExtractionModule::e_Tabular);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			///////////////////////////////////////////////////////
			// Extract tabular data as a JSON string
			echo(nl2br("Extract tabular data as a JSON string\n"));

			$outputFile = $outputPath."financial.json";
			$json = DataExtractionModule::ExtractData($inputPath."financial.pdf", DataExtractionModule::e_Tabular);
			WriteTextToFile($outputFile, $json);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			///////////////////////////////////////////////////////
			// Extract tabular data as an XLSX file
			echo(nl2br("Extract tabular data as an XLSX file\n"));

			$outputFile = $outputPath."table.xlsx";
			DataExtractionModule::ExtractToXLSX($inputPath."table.pdf", $outputFile);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			///////////////////////////////////////////////////////
			// Extract tabular data as an XLSX stream (also known as filter)
			echo(nl2br("Extract tabular data as an XLSX stream\n"));

			$outputFile = $outputPath."financial.xlsx";
			$outputXlsxStream = new MemoryFilter(0, false);
			$options = new DataExtractionOptions();
			$options->SetPages("1"); // page 1
			DataExtractionModule::ExtractToXLSX($inputPath."financial.pdf", $outputXlsxStream, $options);
			// Switch the filter to input mode before writing its content to disk.
			$outputXlsxStream->SetAsInputFilter();
			$outputXlsxStream->WriteToFile($outputFile, false);

			echo(nl2br("Result saved in " . $outputFile . "\n"));
		}
		catch(Exception $e) {
			echo(nl2br("Unable to extract tabular data, error: " . $e->getMessage() . "\n"));
		}
	}

	//////////////////////////////////////////////////////////////////////////
	// The following sample illustrates how to extract document structure from PDF documents.
	//////////////////////////////////////////////////////////////////////////

	// Test if the add-on is installed
	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure)) {
		PrintModuleUnavailable("Structured Output");
	}
	else {
		try {
			// Extract document structure as a JSON file
			echo(nl2br("Extract document structure as a JSON file\n"));

			$outputFile = $outputPath."paragraphs_and_tables.json";
			DataExtractionModule::ExtractData($inputPath."paragraphs_and_tables.pdf", $outputFile, DataExtractionModule::e_DocStructure);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			///////////////////////////////////////////////////////
			// Extract document structure as a JSON string
			echo(nl2br("Extract document structure as a JSON string\n"));

			$outputFile = $outputPath."tagged.json";
			$json = DataExtractionModule::ExtractData($inputPath."tagged.pdf", DataExtractionModule::e_DocStructure);
			WriteTextToFile($outputFile, $json);

			echo(nl2br("Result saved in " . $outputFile . "\n"));
		}
		catch(Exception $e) {
			echo(nl2br("Unable to extract document structure data, error: " . $e->getMessage() . "\n"));
		}
	}

	//////////////////////////////////////////////////////////////////////////
	// The following sample illustrates how to extract form fields from PDF documents.
	//////////////////////////////////////////////////////////////////////////

	// Test if the add-on is installed
	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form)) {
		PrintModuleUnavailable("AIFormFieldExtractor");
	}
	else {
		try {
			// Extract form fields as a JSON file
			echo(nl2br("Extract form fields as a JSON file\n"));

			$outputFile = $outputPath."formfields-scanned.json";
			DataExtractionModule::ExtractData($inputPath."formfields-scanned.pdf", $outputFile, DataExtractionModule::e_Form);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			///////////////////////////////////////////////////////
			// Extract form fields as a JSON string
			echo(nl2br("Extract form fields as a JSON string\n"));

			$outputFile = $outputPath."formfields.json";
			$json = DataExtractionModule::ExtractData($inputPath."formfields.pdf", DataExtractionModule::e_Form);
			WriteTextToFile($outputFile, $json);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			///////////////////////////////////////////////////////
			// Detect and add form fields to a PDF document.
			// PDF document already has form fields, and this sample will update to new found fields.
			echo(nl2br("Extract form fields as a PDF file\n"));

			$doc = new PDFDoc($inputPath."formfields-scanned-withfields.pdf");
			DataExtractionModule::DetectAndAddFormFieldsToPDF($doc);
			$doc->Save($outputPath."formfields-scanned-fields-new.pdf", SDFDoc::e_linearized);
			$doc->Close();

			echo(nl2br("Result saved in " . $outputPath ."formfields-scanned-fields-new.pdf" . "\n"));

			///////////////////////////////////////////////////////
			// Detect and add form fields to a PDF document.
			// PDF document already has form fields, and this sample will keep the original fields.
			echo(nl2br("Extract form fields as a PDF file\n"));

			$doc = new PDFDoc($inputPath."formfields-scanned-withfields.pdf");
			$options = new DataExtractionOptions();
			$options->SetOverlappingFormFieldBehavior("KeepOld");
			DataExtractionModule::DetectAndAddFormFieldsToPDF($doc, $options);
			$doc->Save($outputPath."formfields-scanned-fields-old.pdf", SDFDoc::e_linearized);
			$doc->Close();

			echo(nl2br("Result saved in " . $outputPath ."formfields-scanned-fields-old.pdf" . "\n"));

		}
		catch(Exception $e) {
			echo(nl2br("Unable to extract form fields data, error: " . $e->getMessage() . "\n"));
		}
	}

	//////////////////////////////////////////////////////////////////////////
	// The following sample illustrates how to extract key-value pairs from PDF documents.
	//////////////////////////////////////////////////////////////////////////

	// Test if the add-on is installed
	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_GenericKeyValue)) {
		PrintModuleUnavailable("AIPageObjectExtractor");
	}
	else {
		try {

			echo(nl2br("Extract key-value pairs from a PDF\n"));
			// Simple example: Extract Keys & Values as a JSON file
			$outputFile = $outputPath."newsletter_key_val.json";
			DataExtractionModule::ExtractData($inputPath."newsletter.pdf", $outputFile, DataExtractionModule::e_GenericKeyValue);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			// Example with customized options:
			// Extract Keys & Values from pages 2-4, excluding ads
			$options = new DataExtractionOptions();
			$options->SetPages("2-4");

			$p2ExclusionZones = new RectCollection();
			// Exclude the ad on page 2
			// These coordinates are in PDF user space, with the origin at the bottom left corner of the page
			// Coordinates rotate with the page, if it has rotation applied.
			$p2ExclusionZones->AddRect(new Rect(166.0, 47.0, 562.0, 222.0));
			$options->AddExclusionZonesForPage($p2ExclusionZones, 2);

			$p4InclusionZones = new RectCollection();
			$p4ExclusionZones = new RectCollection();
			// Only include the article text for page 4, exclude ads and headings
			$p4InclusionZones->AddRect(new Rect(30.0, 432.0, 562.0, 684.0));
			$p4ExclusionZones->AddRect(new Rect(30.0, 657.0, 295.0, 684.0));
			$options->AddInclusionZonesForPage($p4InclusionZones, 4);
			$options->AddExclusionZonesForPage($p4ExclusionZones, 4);

			echo(nl2br("Extract Key-Value pairs from specific pages and zones as a JSON file\n"));
			$outputFile = $outputPath."newsletter_key_val_with_zones.json";
			DataExtractionModule::ExtractData($inputPath."newsletter.pdf", $outputFile, DataExtractionModule::e_GenericKeyValue, $options);

			echo(nl2br("Result saved in " . $outputFile . "\n"));
		}
		catch(Exception $e) {
			// Report the failure under the sample that actually failed (key-value extraction).
			echo(nl2br("Unable to extract key-value pairs, error: " . $e->getMessage() . "\n"));
		}
	}

	//-----------------------------------------------------------------------------------

	PDFNet::Terminate();
	echo(nl2br("Done.\n"));
}
275
276main();
277?>
278
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6//---------------------------------------------------------------------------------------
7// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
8// extract various types of data from PDF documents.
9//
10// The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
11//---------------------------------------------------------------------------------------
12
13const fs = require('fs');
14const { PDFNet } = require('../../lib/pdfnet.js');
15const PDFTronLicense = require('../../LicenseKey/NODEJS/LicenseKey');
16
17((exports) => {
18 'use strict';
19
20 exports.runDataExtractionTest = () => {
21
22 const main = async () => {
23
24 const inputPath = '../TestFiles/';
25 const outputPath = '../TestFiles/Output/';
26
27 //////////////////////////////////////////////////////////////////////////
28
29 await PDFNet.addResourceSearchPath('../../lib/');
30
31 //////////////////////////////////////////////////////////////////////////
32 // The following sample illustrates how to extract tables from PDF documents.
33 //////////////////////////////////////////////////////////////////////////
34
35 // Test if the add-on is installed
36 if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular)) {
37 console.log('\nUnable to run Data Extraction: Apryse SDK Tabular Data module not available.');
38 console.log('---------------------------------------------------------------');
39 console.log('The Data Extraction suite is an optional add-on, available for download');
40 console.log('at https://docs.apryse.com/documentation/core/info/modules/. If you have already');
41 console.log('downloaded this module, ensure that the SDK is able to find the required files');
42 console.log('using the PDFNet.addResourceSearchPath() function.\n');
43 }
44 else
45 {
46 try {
47 // Extract tabular data as a JSON file
48 console.log('Extract tabular data as a JSON file');
49
50 let outputFile = outputPath + 'table.json';
51 await PDFNet.DataExtractionModule.extractData(inputPath + 'table.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular);
52
53 console.log('Result saved in ' + outputFile);
54
55 ///////////////////////////////////////////////////////
56 // Extract tabular data as a JSON string
57 console.log('Extract tabular data as a JSON string');
58
59 outputFile = outputPath + 'financial.json';
60 const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'financial.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular);
61 fs.writeFileSync(outputFile, json);
62
63 console.log('Result saved in ' + outputFile);
64
65 ///////////////////////////////////////////////////////
66 // Extract tabular data as an XLSX file
67 console.log('Extract tabular data as an XLSX file');
68
69 outputFile = outputPath + 'table.xlsx';
70 await PDFNet.DataExtractionModule.extractToXLSX(inputPath + 'table.pdf', outputFile);
71
72 console.log('Result saved in ' + outputFile);
73
74 ///////////////////////////////////////////////////////
75 // Extract tabular data as an XLSX stream (also known as filter)
76 console.log('Extract tabular data as an XLSX stream');
77
78 outputFile = outputPath + 'financial.xlsx';
79 const outputXlsxStream = await PDFNet.Filter.createMemoryFilter(0, false);
80 const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
81 options.setPages('1'); // page 1
82 await PDFNet.DataExtractionModule.extractToXLSXWithFilter(inputPath + 'financial.pdf', outputXlsxStream, options);
83 outputXlsxStream.memoryFilterSetAsInputFilter();
84 outputXlsxStream.writeToFile(outputFile, false);
85
86 console.log('Result saved in ' + outputFile);
87 } catch (err) {
88 console.log(err);
89 }
90 }
91
92 //////////////////////////////////////////////////////////////////////////
93 // The following sample illustrates how to extract document structure from PDF documents.
94 //////////////////////////////////////////////////////////////////////////
95
96 // Test if the add-on is installed
97 if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure)) {
98 console.log('\nUnable to run Data Extraction: Apryse SDK Structured Output module not available.');
99 console.log('---------------------------------------------------------------');
100 console.log('The Data Extraction suite is an optional add-on, available for download');
101 console.log('at https://docs.apryse.com/documentation/core/info/modules/. If you have already');
102 console.log('downloaded this module, ensure that the SDK is able to find the required files');
103 console.log('using the PDFNet.addResourceSearchPath() function.\n');
104 }
105 else
106 {
107 try {
108 // Extract document structure as a JSON file
109 console.log('Extract document structure as a JSON file');
110
111 let outputFile = outputPath + 'paragraphs_and_tables.json';
112 await PDFNet.DataExtractionModule.extractData(inputPath + 'paragraphs_and_tables.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
113
114 console.log('Result saved in ' + outputFile);
115
116 ///////////////////////////////////////////////////////
117 // Extract document structure as a JSON string
118 console.log('Extract document structure as a JSON string');
119
120 outputFile = outputPath + 'tagged.json';
121 const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'tagged.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
122 fs.writeFileSync(outputFile, json);
123
124 console.log('Result saved in ' + outputFile);
125 } catch (err) {
126 console.log(err);
127 }
128 }
129
130 //////////////////////////////////////////////////////////////////////////
131 // The following sample illustrates how to extract form fields from PDF documents.
132 //////////////////////////////////////////////////////////////////////////
133
134 // Test if the add-on is installed
135 if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_Form)) {
136 console.log('\nUnable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.');
137 console.log('---------------------------------------------------------------');
138 console.log('The Data Extraction suite is an optional add-on, available for download');
139 console.log('at https://docs.apryse.com/documentation/core/info/modules/. If you have already');
140 console.log('downloaded this module, ensure that the SDK is able to find the required files');
141 console.log('using the PDFNet.addResourceSearchPath() function.\n');
142 }
143 else
144 {
145 try {
146 // Extract form fields as a JSON file
147 console.log('Extract form fields as a JSON file');
148
149 let outputFile = outputPath + 'formfields-scanned.json';
150 await PDFNet.DataExtractionModule.extractData(inputPath + 'formfields-scanned.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_Form);
151
152 console.log('Result saved in ' + outputFile);
153
154 ///////////////////////////////////////////////////////
155 // Extract form fields as a JSON string
156 console.log('Extract form fields as a JSON string');
157
158 outputFile = outputPath + 'formfields.json';
159 const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'formfields.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_Form);
160 fs.writeFileSync(outputFile, json);
161
162 console.log('Result saved in ' + outputFile);
163
164 //////////////////////////////////////////////////////////////////////////
165 // Detect and add form fields to a PDF document.
166 // Document already has form fields, and this sample will update to new found fields.
167 {
168 console.log('Detect and add form fields in a PDF file, keep new fields');
169
170 const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'formfields-scanned-withfields.pdf');
171
172 await PDFNet.DataExtractionModule.detectAndAddFormFieldsToPDF(doc);
173 outputFile = outputPath + 'formfields-scanned-fields-new.pdf';
174 await doc.save(outputFile, PDFNet.SDFDoc.SaveOptions.e_linearized);
175
176 console.log('Result saved in ' + outputFile);
177 }
178
179 //////////////////////////////////////////////////////////////////////////
180 // Detect and add form fields to a PDF document.
181 // Document already has form fields, and this sample will keep the original fields.
182 {
183 console.log('Detect and add form fields in a PDF file, keep old fields');
184
185 const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'formfields-scanned-withfields.pdf');
186
187 const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
188 options.setOverlappingFormFieldBehavior('KeepOld');
189
190 await PDFNet.DataExtractionModule.detectAndAddFormFieldsToPDF(doc, options);
191 outputFile = outputPath + 'formfields-scanned-fields-old.pdf';
192 await doc.save(outputFile, PDFNet.SDFDoc.SaveOptions.e_linearized);
193 }
194
195 console.log('Result saved in ' + outputFile);
196
197 } catch (err) {
198 console.log(err);
199 }
200 }
201
202 //////////////////////////////////////////////////////////////////////////
203 // The following sample illustrates how to extract key-value pairs from PDF documents.
204 //////////////////////////////////////////////////////////////////////////
205 if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_GenericKeyValue)) {
206 console.log();
207 console.log('Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.');
208 console.log('---------------------------------------------------------------');
209 console.log('The Data Extraction suite is an optional add-on, available for download');
210 console.log('at http://www.pdftron.com/. If you have already downloaded this');
211 console.log('module, ensure that the SDK is able to find the required files');
212 console.log('using the PDFNet.addResourceSearchPath() function.');
213 console.log();
214 }
215 else
216 {
217 try {
218 // Simple example: Extract Keys & Values as a JSON file
219 console.log('Extract Key-Value pairs as a JSON file');
220 await PDFNet.DataExtractionModule.extractData(inputPath + 'newsletter.pdf', outputPath + 'newsletter_key_val.json', PDFNet.DataExtractionModule.DataExtractionEngine.e_GenericKeyValue);
221 console.log('Result saved in ' + outputPath + 'newsletter_key_val.json');
222
223 const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
224 options.setPages('2-4');
225
226 const p2ExclusionZones = [];
227 // Exclude the ad on page 2
228 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
229 // Coordinates rotate with the page, if it has rotation applied.
230 p2ExclusionZones.push(new PDFNet.Rect(166, 47, 562, 222));
231 options.addExclusionZonesForPage(p2ExclusionZones, 2);
232
233 const p4InclusionZones = [];
234 const p4ExclusionZones = [];
235 // Only include the article text for page 4, exclude ads and headings
236 p4InclusionZones.push(new PDFNet.Rect(30, 432, 562, 684));
237 p4ExclusionZones.push(new PDFNet.Rect(30, 657, 295, 684));
238 options.addInclusionZonesForPage(p4InclusionZones, 4);
239 options.addExclusionZonesForPage(p4ExclusionZones, 4);
240 console.log('Extract Key-Value pairs from specific pages and zones as a JSON file');
241 await PDFNet.DataExtractionModule.extractData(inputPath + 'newsletter.pdf', outputPath + 'newsletter_key_val_with_zones.json', PDFNet.DataExtractionModule.DataExtractionEngine.e_GenericKeyValue, options);
242 console.log('Result saved in ' + outputPath + 'newsletter_key_val_with_zones.json');
243 } catch (err) {
244 console.log(err);
245 }
246 }
247 //////////////////////////////////////////////////////////////////////////
248
249 console.log('Done.');
250 };
251
252 PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function (error) {
253 console.log('Error: ' + JSON.stringify(error));
254 }).then(function () { return PDFNet.shutdown(); });
255 };
256 exports.runDataExtractionTest();
257})(exports);
258// eslint-disable-next-line spaced-comment
259//# sourceURL=DataExtractionTest.js
260
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11import platform
12
13sys.path.append("../../LicenseKey/PYTHON")
14from LicenseKey import *
15
16#---------------------------------------------------------------------------------------
17# The Data Extraction suite is an optional PDFNet add-on collection that can be used to
18# extract various types of data from PDF documents.
19#
20# The PDFTron SDK Data Extraction suite can be downloaded from
21# https://docs.apryse.com/documentation/core/info/modules/
22#
23# Please contact us if you have any questions.
24#---------------------------------------------------------------------------------------
25
26# Relative path to the folder containing the test files.
27inputPath = "../../TestFiles/"
28outputPath = "../../TestFiles/Output/"
29
def WriteTextToFile(outputFile, text):
    """Write ``text`` to the file at ``outputFile``, replacing any existing content.

    The file is written as UTF-8 regardless of the platform's default
    encoding, so text containing non-ASCII characters is stored the same
    way on every system.

    Args:
        outputFile: Path of the file to create or overwrite.
        text: The string to write.
    """
    # The with-block closes the file even if the write fails.
    with open(outputFile, "w", encoding="utf-8") as f:
        f.write(text)
37
def _extract_tabular_data():
    """Extract tables from the sample PDFs as JSON and XLSX output."""
    # Test if the add-on is installed
    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_Tabular):
        print("")
        print("Unable to run Data Extraction: PDFTron SDK Tabular Data module not available.")
        print("-----------------------------------------------------------------------------")
        print("The Data Extraction suite is an optional add-on, available for download")
        print("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
        print("downloaded this module, ensure that the SDK is able to find the required files")
        print("using the PDFNet.AddResourceSearchPath() function.")
        print("")
        return

    try:
        # Extract tabular data as a JSON file
        print("Extract tabular data as a JSON file")

        outputFile = outputPath + "table.json"
        DataExtractionModule.ExtractData(inputPath + "table.pdf", outputFile, DataExtractionModule.e_Tabular)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract tabular data as a JSON string
        print("Extract tabular data as a JSON string")

        outputFile = outputPath + "financial.json"
        json_text = DataExtractionModule.ExtractData(inputPath + "financial.pdf", DataExtractionModule.e_Tabular)
        WriteTextToFile(outputFile, json_text)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract tabular data as an XLSX file
        print("Extract tabular data as an XLSX file")

        outputFile = outputPath + "table.xlsx"
        DataExtractionModule.ExtractToXLSX(inputPath + "table.pdf", outputFile)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract tabular data as an XLSX stream (also known as filter)
        print("Extract tabular data as an XLSX stream")

        outputFile = outputPath + "financial.xlsx"
        options = DataExtractionOptions()
        options.SetPages("1") # page 1
        outputXlsxStream = MemoryFilter(0, False)
        DataExtractionModule.ExtractToXLSX(inputPath + "financial.pdf", outputXlsxStream, options)
        outputXlsxStream.SetAsInputFilter()
        outputXlsxStream.WriteToFile(outputFile, False)

        print("Result saved in " + outputFile)
    except Exception as e:
        print("Unable to extract tabular data, error: " + str(e))


def _extract_document_structure():
    """Extract document structure from the sample PDFs as JSON output."""
    # Test if the add-on is installed
    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_DocStructure):
        print("")
        print("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
        print("-----------------------------------------------------------------------------")
        print("The Data Extraction suite is an optional add-on, available for download")
        print("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
        print("downloaded this module, ensure that the SDK is able to find the required files")
        print("using the PDFNet.AddResourceSearchPath() function.")
        print("")
        return

    try:
        # Extract document structure as a JSON file
        print("Extract document structure as a JSON file")

        outputFile = outputPath + "paragraphs_and_tables.json"
        DataExtractionModule.ExtractData(inputPath + "paragraphs_and_tables.pdf", outputFile, DataExtractionModule.e_DocStructure)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract document structure as a JSON string
        print("Extract document structure as a JSON string")

        outputFile = outputPath + "tagged.json"
        json_text = DataExtractionModule.ExtractData(inputPath + "tagged.pdf", DataExtractionModule.e_DocStructure)
        WriteTextToFile(outputFile, json_text)

        print("Result saved in " + outputFile)
    except Exception as e:
        print("Unable to extract document structure data, error: " + str(e))


def _extract_form_fields():
    """Extract form fields as JSON, then detect and add form fields to a PDF."""
    # Test if the add-on is installed
    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_Form):
        print("")
        print("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.")
        print("-----------------------------------------------------------------------------")
        print("The Data Extraction suite is an optional add-on, available for download")
        print("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
        print("downloaded this module, ensure that the SDK is able to find the required files")
        print("using the PDFNet.AddResourceSearchPath() function.")
        print("")
        return

    try:
        # Extract form fields as a JSON file
        print("Extract form fields as a JSON file")

        outputFile = outputPath + "formfields-scanned.json"
        DataExtractionModule.ExtractData(inputPath + "formfields-scanned.pdf", outputFile, DataExtractionModule.e_Form)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract form fields as a JSON string
        print("Extract form fields as a JSON string")

        outputFile = outputPath + "formfields.json"
        json_text = DataExtractionModule.ExtractData(inputPath + "formfields.pdf", DataExtractionModule.e_Form)
        WriteTextToFile(outputFile, json_text)

        print("Result saved in " + outputFile)

        #-----------------------------------------------------------------------------------
        # Detect and add form fields to a PDF document.
        # PDF document already has form fields, and this sample will update to new found fields.
        print("Extract form fields as a pdf file, update to new")

        doc = PDFDoc(inputPath + "formfields-scanned-withfields.pdf")
        try:
            DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)

            outputFile = outputPath + "formfields-scanned-fields-new.pdf"
            doc.Save(outputFile, SDFDoc.e_linearized)
        finally:
            # Close the document even when detection or saving fails.
            doc.Close()

        print("Result saved in " + outputFile)

        #-----------------------------------------------------------------------------------
        # Detect and add form fields to a PDF document.
        # PDF document already has form fields, and this sample will keep the original fields.
        print("Extract form fields as a pdf file, keep original")

        doc = PDFDoc(inputPath + "formfields-scanned-withfields.pdf")
        try:
            options = DataExtractionOptions()
            options.SetOverlappingFormFieldBehavior("KeepOld")
            DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)

            outputFile = outputPath + "formfields-scanned-fields-old.pdf"
            doc.Save(outputFile, SDFDoc.e_linearized)
        finally:
            # Close the document even when detection or saving fails.
            doc.Close()

        print("Result saved in " + outputFile)

    except Exception as e:
        print("Unable to extract form fields data, error: " + str(e))


def _extract_key_value_pairs():
    """Extract key-value pairs as JSON, with and without page/zone options."""
    # Test if the add-on is installed
    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_GenericKeyValue):
        print()
        print("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.")
        print("---------------------------------------------------------------")
        print("The Data Extraction suite is an optional add-on, available for download")
        print("at http://www.pdftron.com/. If you have already downloaded this")
        print("module, ensure that the SDK is able to find the required files")
        print("using the PDFNet.AddResourceSearchPath() function.")
        print()
        return

    try:
        print("Extract key-value pairs from a PDF")
        # Simple example: Extract Keys & Values as a JSON file
        DataExtractionModule.ExtractData(inputPath + "newsletter.pdf", outputPath + "newsletter_key_val.json", DataExtractionModule.e_GenericKeyValue)
        print("Result saved in " + outputPath + "newsletter_key_val.json")

        # Example with customized options:
        # Extract Keys & Values from pages 2-4, excluding ads
        options = DataExtractionOptions()
        options.SetPages("2-4")

        p2_exclusion_zones = RectCollection()
        # Exclude the ad on page 2
        # These coordinates are in PDF user space, with the origin at the bottom left corner of the page
        # Coordinates rotate with the page, if it has rotation applied.
        p2_exclusion_zones.AddRect(Rect(166, 47, 562, 222))
        options.AddExclusionZonesForPage(p2_exclusion_zones, 2)

        p4_inclusion_zones = RectCollection()
        p4_exclusion_zones = RectCollection()
        # Only include the article text for page 4, exclude ads and headings
        p4_inclusion_zones.AddRect(Rect(30, 432, 562, 684))
        p4_exclusion_zones.AddRect(Rect(30, 657, 295, 684))
        options.AddInclusionZonesForPage(p4_inclusion_zones, 4)
        options.AddExclusionZonesForPage(p4_exclusion_zones, 4)
        print("Extract Key-Value pairs from specific pages and zones as a JSON file")
        DataExtractionModule.ExtractData(inputPath + "newsletter.pdf", outputPath + "newsletter_key_val_with_zones.json", DataExtractionModule.e_GenericKeyValue, options)
        print("Result saved in " + outputPath + "newsletter_key_val_with_zones.json")
    except Exception as e:
        print("Unable to extract key-value data, error: " + str(e))


def main():
    """Run every Data Extraction sample section in turn.

    Each section checks that its add-on module is available and reports
    its own errors, so a failure in one section does not stop the others.
    """
    # The first step in every application using PDFNet is to initialize the
    # library. The library is usually initialized only once, but calling
    # Initialize() multiple times is also fine.
    PDFNet.Initialize(LicenseKey)

    try:
        PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")

        _extract_tabular_data()
        _extract_document_structure()
        _extract_form_fields()
        _extract_key_value_pairs()
    finally:
        # Release the library even if an unexpected error escapes a section.
        PDFNet.Terminate()

    print("Done.")

if __name__ == '__main__':
    main()
260
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
12#---------------------------------------------------------------------------------------
13# The Data Extraction suite is an optional PDFNet add-on collection that can be used to
14# extract various types of data from PDF documents.
15#
16# The PDFTron SDK Data Extraction suite can be downloaded from
17# https://docs.apryse.com/documentation/core/info/modules/
18#
19# Please contact us if you have any questions.
20#---------------------------------------------------------------------------------------
21
22# Relative path to the folder containing the test files.
23$inputPath = "../../TestFiles/"
24$outputPath = "../../TestFiles/Output/"
25
# Extracts tables from the sample PDFs as JSON and XLSX output.
def extract_tabular_data()
  # Test if the add-on is installed
  if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_Tabular) then
    puts ""
    puts "Unable to run Data Extraction: PDFTron SDK Tabular Data module not available."
    puts "-----------------------------------------------------------------------------"
    puts "The Data Extraction suite is an optional add-on, available for download"
    puts "at https://docs.apryse.com/documentation/core/info/modules/. If you have already"
    puts "downloaded this module, ensure that the SDK is able to find the required files"
    puts "using the PDFNet.AddResourceSearchPath() function."
    puts ""
  else
    begin
      # Extract tabular data as a JSON file
      puts "Extract tabular data as a JSON file"

      outputFile = $outputPath + "table.json"
      DataExtractionModule.ExtractData($inputPath + "table.pdf", outputFile, DataExtractionModule::E_Tabular)

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract tabular data as a JSON string
      puts "Extract tabular data as a JSON string"

      outputFile = $outputPath + "financial.json"
      json = DataExtractionModule.ExtractData($inputPath + "financial.pdf", DataExtractionModule::E_Tabular)
      File.open(outputFile, 'w') { |file| file.write(json) }

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract tabular data as an XLSX file
      puts "Extract tabular data as an XLSX file"

      outputFile = $outputPath + "table.xlsx"
      DataExtractionModule.ExtractToXLSX($inputPath + "table.pdf", outputFile)

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract tabular data as an XLSX stream (also known as filter)
      puts "Extract tabular data as an XLSX stream"

      outputFile = $outputPath + "financial.xlsx"
      outputXlsxStream = MemoryFilter.new(0, false)
      options = DataExtractionOptions.new()
      options.SetPages("1") # page 1
      DataExtractionModule.ExtractToXLSX($inputPath + "financial.pdf", outputXlsxStream, options)
      outputXlsxStream.SetAsInputFilter()
      outputXlsxStream.WriteToFile(outputFile, false)

      puts "Result saved in " + outputFile
    rescue => error
      puts "Unable to extract tabular data, error: " + error.message
    end
  end
end

# Extracts document structure from the sample PDFs as JSON output.
def extract_document_structure()
  # Test if the add-on is installed
  if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_DocStructure) then
    puts ""
    puts "Unable to run Data Extraction: PDFTron SDK Structured Output module not available."
    puts "-----------------------------------------------------------------------------"
    puts "The Data Extraction suite is an optional add-on, available for download"
    puts "at https://docs.apryse.com/documentation/core/info/modules/. If you have already"
    puts "downloaded this module, ensure that the SDK is able to find the required files"
    puts "using the PDFNet.AddResourceSearchPath() function."
    puts ""
  else
    begin
      # Extract document structure as a JSON file
      puts "Extract document structure as a JSON file"

      outputFile = $outputPath + "paragraphs_and_tables.json"
      DataExtractionModule.ExtractData($inputPath + "paragraphs_and_tables.pdf", outputFile, DataExtractionModule::E_DocStructure)

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract document structure as a JSON string
      puts "Extract document structure as a JSON string"

      outputFile = $outputPath + "tagged.json"
      json = DataExtractionModule.ExtractData($inputPath + "tagged.pdf", DataExtractionModule::E_DocStructure)
      File.open(outputFile, 'w') { |file| file.write(json) }

      puts "Result saved in " + outputFile
    rescue => error
      puts "Unable to extract document structure data, error: " + error.message
    end
  end
end

# Extracts form fields as JSON, then detects and adds form fields to a PDF.
def extract_form_fields()
  # Test if the add-on is installed
  if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_Form) then
    puts ""
    puts "Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available."
    puts "-----------------------------------------------------------------------------"
    puts "The Data Extraction suite is an optional add-on, available for download"
    puts "at https://docs.apryse.com/documentation/core/info/modules/. If you have already"
    puts "downloaded this module, ensure that the SDK is able to find the required files"
    puts "using the PDFNet.AddResourceSearchPath() function."
    puts ""
  else
    begin
      # Extract form fields as a JSON file
      puts "Extract form fields as a JSON file"

      outputFile = $outputPath + "formfields-scanned.json"
      DataExtractionModule.ExtractData($inputPath + "formfields-scanned.pdf", outputFile, DataExtractionModule::E_Form)

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract form fields as a JSON string
      puts "Extract form fields as a JSON string"

      outputFile = $outputPath + "formfields.json"
      json = DataExtractionModule.ExtractData($inputPath + "formfields.pdf", DataExtractionModule::E_Form)
      File.open(outputFile, 'w') { |file| file.write(json) }

      puts "Result saved in " + outputFile

      #-----------------------------------------------------------------------------------
      # Detect and add form fields to a PDF document.
      # PDF document already has form fields, and this sample will update to the new fields.
      puts "Detect and add form fields in a PDF file, keep new fields"
      doc = PDFDoc.new($inputPath + "formfields-scanned-withfields.pdf")

      outputFile = $outputPath + "formfields-scanned-fields-new.pdf"

      begin
        DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)
        doc.Save(outputFile, SDFDoc::E_linearized)
      ensure
        # Close the document even when detection or saving fails.
        doc.Close
      end

      puts "Result saved in " + outputFile

      #-----------------------------------------------------------------------------------
      # Detect and add form fields to a PDF document.
      # PDF document already has form fields, and this sample will keep the original fields.
      puts "Detect and add form fields in a PDF file, keep old fields"
      doc = PDFDoc.new($inputPath + "formfields-scanned-withfields.pdf")

      outputFile = $outputPath + "formfields-scanned-fields-old.pdf"

      begin
        options = DataExtractionOptions.new()
        options.SetOverlappingFormFieldBehavior("KeepOld")
        DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)
        doc.Save(outputFile, SDFDoc::E_linearized)
      ensure
        # Close the document even when detection or saving fails.
        doc.Close
      end

      puts "Result saved in " + outputFile
    rescue => error
      puts "Unable to extract form fields data, error: " + error.message
    end
  end
end

# Extracts key-value pairs as JSON, with and without page/zone options.
def extract_key_value_pairs()
  # Test if the add-on is installed
  if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_GenericKeyValue) then
    puts ""
    puts "Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available."
    puts "-----------------------------------------------------------------------------"
    puts "The Data Extraction suite is an optional add-on, available for download"
    puts "at https://docs.apryse.com/documentation/core/info/modules/. If you have already"
    puts "downloaded this module, ensure that the SDK is able to find the required files"
    puts "using the PDFNet.AddResourceSearchPath() function."
    puts ""
  else
    begin
      puts "Extract key-value pairs from a PDF"
      # Simple example: Extract Keys & Values as a JSON file
      DataExtractionModule.ExtractData($inputPath + "newsletter.pdf", $outputPath + "newsletter_key_val.json", DataExtractionModule::E_GenericKeyValue)
      puts "Result saved in " + $outputPath + "newsletter_key_val.json"

      # Example with customized options:
      # Extract Keys & Values from pages 2-4, excluding ads
      options = DataExtractionOptions.new()
      options.SetPages("2-4")

      p2_exclusion_zones = RectCollection.new()
      # Exclude the ad on page 2
      # These coordinates are in PDF user space, with the origin at the bottom left corner of the page
      # Coordinates rotate with the page, if it has rotation applied.
      p2_exclusion_zones.AddRect(Rect.new(166, 47, 562, 222))
      options.AddExclusionZonesForPage(p2_exclusion_zones, 2)

      p4_inclusion_zones = RectCollection.new()
      p4_exclusion_zones = RectCollection.new()
      # Only include the article text for page 4, exclude ads and headings
      p4_inclusion_zones.AddRect(Rect.new(30, 432, 562, 684))
      p4_exclusion_zones.AddRect(Rect.new(30, 657, 295, 684))
      options.AddInclusionZonesForPage(p4_inclusion_zones, 4)
      options.AddExclusionZonesForPage(p4_exclusion_zones, 4)
      puts "Extract Key-Value pairs from specific pages and zones as a JSON file"
      DataExtractionModule.ExtractData($inputPath + "newsletter.pdf", $outputPath + "newsletter_key_val_with_zones.json", DataExtractionModule::E_GenericKeyValue, options)
      puts "Result saved in " + $outputPath + "newsletter_key_val_with_zones.json"
    rescue => error
      puts "Unable to extract key-value data, error: " + error.message
    end
  end
end

# Runs every Data Extraction sample section in turn. Each section checks
# that its add-on module is available and reports its own errors, so a
# failure in one section does not stop the others.
def main()
  # The first step in every application using PDFNet is to initialize the
  # library. The library is usually initialized only once, but calling
  # Initialize() multiple times is also fine.
  PDFNet.Initialize(PDFTronLicense.Key)

  begin
    PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")

    extract_tabular_data()
    extract_document_structure()
    extract_form_fields()
    extract_key_value_pairs()
  ensure
    # Release the library even if an unexpected error escapes a section.
    PDFNet.Terminate
  end

  puts "Done."
end

main()
254
1'
2' Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3'
4
5Imports pdftron
6Imports pdftron.Common
7Imports pdftron.PDF
8Imports pdftron.Filters
9
10' The Data Extraction suite is an optional PDFNet add-on collection that can be used to
11' extract various types of data from PDF documents.
12' The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
13
Module DataExtractionTestVB
    ' Reference to the loader returned by PDFNetLoader.Instance(); assigned once in the module constructor.
    Dim pdfNetLoader As PDFNetLoader
    Sub New()
        pdfNetLoader = pdftron.PDFNetLoader.Instance()
    End Sub

    ' Relative path to the folder containing test files.
    Dim input_path As String = "../../../../TestFiles/"
    Dim output_path As String = "../../../../TestFiles/Output/"

    ''' <summary>
    ''' Entry point. Initializes PDFNet, registers the resource search path,
    ''' runs every data extraction sample in turn and terminates PDFNet.
    ''' </summary>
    Sub Main()
        PDFNet.Initialize(PDFTronLicense.Key)
        Try
            PDFNet.AddResourceSearchPath("../../../../../Lib/")

            TestTabularData()
            TestDocumentStructure()
            TestFormFields()
            TestGenericKeyValue()
        Finally
            ' Always terminate, even when a sample lets an exception escape
            ' (the samples only catch PDFNetException).
            PDFNet.Terminate()
        End Try
    End Sub


    ''' <summary>
    ''' Prints the notice shown when a Data Extraction add-on module cannot be found.
    ''' </summary>
    ''' <param name="moduleName">Module name inserted into the first line of the notice.</param>
    Private Sub ReportModuleUnavailable(ByVal moduleName As String)
        Console.WriteLine()
        Console.WriteLine("Unable to run Data Extraction: Apryse SDK " & moduleName & " module not available.")
        Console.WriteLine("---------------------------------------------------------------")
        Console.WriteLine("The Data Extraction suite is an optional add-on, available for download")
        Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this")
        Console.WriteLine("module, ensure that the SDK is able to find the required files")
        Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.")
        Console.WriteLine()
    End Sub


    ''' <summary>
    ''' The following sample illustrates how to extract tables from PDF documents.
    ''' Writes table.json, financial.json, table.xlsx and financial.xlsx to the output folder.
    ''' A PDFNetException is reported on the console and ends this sample only.
    ''' </summary>
    Sub TestTabularData()
        ' Test if the add-on is installed
        If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular) Then
            ReportModuleUnavailable("Tabular Data")
            Return
        End If

        Try
            ' Extract tabular data as a JSON file
            DataExtractionModule.ExtractData(input_path & "table.pdf", output_path & "table.json", DataExtractionModule.DataExtractionEngine.e_tabular)

            ' Extract tabular data as a JSON string
            Dim json As String = DataExtractionModule.ExtractData(input_path & "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular)
            System.IO.File.WriteAllText(output_path & "financial.json", json)

            ' Extract tabular data as an XLSX file
            DataExtractionModule.ExtractToXLSX(input_path & "table.pdf", output_path & "table.xlsx")

            ' Extract tabular data as an XLSX stream (also known as filter)
            Dim output_xlsx_stream As MemoryFilter = New MemoryFilter(0, False)
            DataExtractionModule.ExtractToXLSX(input_path & "financial.pdf", output_xlsx_stream)
            ' Switch the filter to input mode before writing its contents out to disk.
            output_xlsx_stream.SetAsInputFilter()
            output_xlsx_stream.WriteToFile(output_path & "financial.xlsx", False)

        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try
    End Sub


    ''' <summary>
    ''' The following sample illustrates how to extract document structure from PDF documents.
    ''' Writes paragraphs_and_tables.json and tagged.json to the output folder.
    ''' A PDFNetException is reported on the console and ends this sample only.
    ''' </summary>
    Sub TestDocumentStructure()
        ' Test if the add-on is installed
        If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure) Then
            ReportModuleUnavailable("Structured Output")
            Return
        End If

        Try
            ' Extract document structure as a JSON file
            DataExtractionModule.ExtractData(input_path & "paragraphs_and_tables.pdf", output_path & "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure)

            ' Extract document structure as a JSON string
            Dim json As String = DataExtractionModule.ExtractData(input_path & "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure)
            System.IO.File.WriteAllText(output_path & "tagged.json", json)

        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try
    End Sub


    ''' <summary>
    ''' The following sample illustrates how to extract form fields from PDF documents,
    ''' and how to detect form fields and add them to a PDF document.
    ''' A PDFNetException is reported on the console and ends this sample only.
    ''' </summary>
    Sub TestFormFields()
        ' Test if the add-on is installed
        If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form) Then
            ReportModuleUnavailable("AIFormFieldExtractor")
            Return
        End If

        Try
            ' Extract form fields as a JSON file
            DataExtractionModule.ExtractData(input_path & "formfields-scanned.pdf", output_path & "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form)

            ' Extract form fields as a JSON string
            Dim json As String = DataExtractionModule.ExtractData(input_path & "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form)
            System.IO.File.WriteAllText(output_path & "formfields.json", json)

            ' Detect and add form fields to a PDF document.
            ' PDF document already has form fields, and this sample will update to new found fields.
            Using doc = New PDFDoc(input_path & "formfields-scanned-withfields.pdf")
                DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)
                doc.Save(output_path & "formfields-scanned-fields-new.pdf", SDF.SDFDoc.SaveOptions.e_linearized)
            End Using

            ' Detect and add form fields to a PDF document.
            ' PDF document already has form fields, and this sample will keep the original fields.
            Using doc = New PDFDoc(input_path & "formfields-scanned-withfields.pdf")
                Dim options = New DataExtractionOptions()
                options.SetOverlappingFormFieldBehavior("KeepOld")
                DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)
                doc.Save(output_path & "formfields-scanned-fields-old.pdf", SDF.SDFDoc.SaveOptions.e_linearized)
            End Using

        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

    End Sub

    ''' <summary>
    ''' The following sample illustrates how to extract key-value pairs from PDF documents.
    ''' Writes newsletter_key_val.json and newsletter_key_val_with_zones.json to the output folder.
    ''' A PDFNetException is reported on the console and ends this sample only,
    ''' matching the behavior of the other samples in this module.
    ''' </summary>
    Sub TestGenericKeyValue()
        ' Test if the add-on is installed
        If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_generic_key_value) Then
            ReportModuleUnavailable("AIPageObjectExtractor")
            Return
        End If

        Try
            ' Simple example: Extract Keys & Values as a JSON file
            DataExtractionModule.ExtractData(input_path & "newsletter.pdf", output_path & "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value)

            ' Example with customized options:
            ' Extract Keys & Values from pages 2-4, excluding ads
            Dim options As New DataExtractionOptions()
            options.SetPages("2-4")

            Dim p2ExclusionZones As New RectCollection()
            ' Exclude the ad on page 2
            ' These coordinates are in PDF user space, with the origin at the bottom left corner of the page
            ' Coordinates rotate with the page, if it has rotation applied.
            p2ExclusionZones.AddRect(166, 47, 562, 222)
            options.AddExclusionZonesForPage(p2ExclusionZones, 2)

            Dim p4InclusionZones As New RectCollection()
            Dim p4ExclusionZones As New RectCollection()
            ' Only include the article text for page 4, exclude ads and headings
            p4InclusionZones.AddRect(30, 432, 562, 684)
            p4ExclusionZones.AddRect(30, 657, 295, 684)
            options.AddInclusionZonesForPage(p4InclusionZones, 4)
            options.AddExclusionZonesForPage(p4ExclusionZones, 4)

            DataExtractionModule.ExtractData(input_path & "newsletter.pdf", output_path & "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options)

        Catch e As PDFNetException
            ' Previously unhandled here: a failed extraction crashed the whole sample.
            Console.WriteLine(e.Message)
        End Try
    End Sub

End Module
190
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales