Sample code shows how to use the Apryse Data Extraction module to extract tabular data, document structure and form fields from PDF documents. Sample code provided in Python, C++, C# (.Net), Java, Node.js (JavaScript), PHP, Ruby and VB.
Looking for data extraction + WebViewer UI? Check out our Document Structure Extraction - Showcase Sample Code
Learn more about our Server SDK and Smart Data Extraction.
To use this feature in production, your license key will need the Smart Data Extraction Package. Trial keys already include all packages.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6using System;
7
8using pdftron;
9using pdftron.Common;
10using pdftron.PDF;
11using pdftron.SDF;
12using pdftron.Filters;
13
namespace DataExtractionTestCS
{
    /// <summary>
    /// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
    /// extract various types of data from PDF documents.
    ///
    /// The Apryse SDK Data Extraction suite can be downloaded from
    /// https://docs.apryse.com/core/guides/info/modules#data-extraction-module
    /// </summary>
    class Class1
    {
        private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
        static Class1() { }

        // Relative path to the folder containing test files.
        static string input_path = "../../../../TestFiles/";
        static string output_path = "../../../../TestFiles/Output/";

        // Download location printed whenever a required engine is missing.
        private const string ModuleDownloadUrl = "https://docs.apryse.com/core/guides/info/modules#data-extraction-module";

        /// <summary>
        /// Prints the banner shown when a Data Extraction engine is not installed.
        /// </summary>
        /// <param name="moduleName">Module name inserted into the first message line.</param>
        private static void ReportModuleUnavailable(string moduleName)
        {
            Console.WriteLine();
            Console.WriteLine("Unable to run Data Extraction: Apryse SDK " + moduleName + " module not available.");
            Console.WriteLine("---------------------------------------------------------------");
            Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
            Console.WriteLine("at " + ModuleDownloadUrl + " . If you have already downloaded this");
            Console.WriteLine("module, ensure that the SDK is able to find the required files");
            Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
            Console.WriteLine();
        }

        /// <summary>
        /// The following sample illustrates how to extract tables from PDF documents.
        /// </summary>
        static void TestTabularData()
        {
            // Test if the add-on is installed
            if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular))
            {
                ReportModuleUnavailable("Tabular Data");
                return;
            }

            try
            {
                // Extract tabular data as a JSON file
                DataExtractionModule.ExtractData(input_path + "table.pdf", output_path + "table.json", DataExtractionModule.DataExtractionEngine.e_tabular);

                // Extract tabular data as a JSON string
                string json = DataExtractionModule.ExtractData(input_path + "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular);
                System.IO.File.WriteAllText(output_path + "financial.json", json);

                // Extract tabular data as an XLSX file
                DataExtractionModule.ExtractToXLSX(input_path + "table.pdf", output_path + "table.xlsx");

                // Extract tabular data as an XLSX stream (also known as filter)
                MemoryFilter output_xlsx_stream = new MemoryFilter(0, false);
                DataExtractionModule.ExtractToXLSX(input_path + "financial.pdf", output_xlsx_stream);
                // Switch the filter to input mode before writing its contents to disk.
                output_xlsx_stream.SetAsInputFilter();
                output_xlsx_stream.WriteToFile(output_path + "financial.xlsx", false);
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// The following sample illustrates how to extract document structure from PDF documents.
        /// </summary>
        static void TestDocumentStructure()
        {
            // Test if the add-on is installed
            if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure))
            {
                ReportModuleUnavailable("Structured Output");
                return;
            }

            try
            {
                // Extract document structure as a JSON file
                DataExtractionModule.ExtractData(input_path + "paragraphs_and_tables.pdf", output_path + "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure);

                // Extract document structure as a JSON string
                string json = DataExtractionModule.ExtractData(input_path + "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure);
                System.IO.File.WriteAllText(output_path + "tagged.json", json);
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// The following sample illustrates how to extract form fields from PDF documents.
        /// </summary>
        static void TestFormFields()
        {
            // Test if the add-on is installed
            if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
            {
                ReportModuleUnavailable("AIFormFieldExtractor");
                return;
            }

            try
            {
                // Extract form fields as a JSON file
                DataExtractionModule.ExtractData(input_path + "formfields-scanned.pdf", output_path + "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form);

                // Extract form fields as a JSON string
                string json = DataExtractionModule.ExtractData(input_path + "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form);
                System.IO.File.WriteAllText(output_path + "formfields.json", json);

                // Detect and add form fields to a PDF document.
                // PDF document already has form fields, and this sample will update to new found fields.
                using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
                {
                    DataExtractionModule.DetectAndAddFormFieldsToPDF(doc);
                    doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDFDoc.SaveOptions.e_linearized);
                }

                // Detect and add form fields to a PDF document.
                // PDF document already has form fields, and this sample will keep the original fields.
                using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
                {
                    DataExtractionOptions options = new DataExtractionOptions();
                    options.SetOverlappingFormFieldBehavior("KeepOld");

                    DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options);
                    doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDFDoc.SaveOptions.e_linearized);
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// The following sample illustrates how to extract key-value pairs from PDF documents.
        /// </summary>
        static void TestGenericKeyValue()
        {
            // Test if the add-on is installed
            if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_generic_key_value))
            {
                ReportModuleUnavailable("AIPageObjectExtractor");
                return;
            }

            try
            {
                // Simple example: Extract Keys & Values as a JSON file
                DataExtractionModule.ExtractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value);

                // Example with customized options:
                // Extract Keys & Values from pages 2-4, excluding ads
                DataExtractionOptions options = new DataExtractionOptions();
                options.SetPages("2-4");

                RectCollection p2ExclusionZones = new RectCollection();
                // Exclude the ad on page 2
                // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
                // Coordinates rotate with the page, if it has rotation applied.
                p2ExclusionZones.AddRect(166, 47, 562, 222);
                options.AddExclusionZonesForPage(p2ExclusionZones, 2);

                RectCollection p4InclusionZones = new RectCollection();
                RectCollection p4ExclusionZones = new RectCollection();
                // Only include the article text for page 4, exclude ads and headings
                p4InclusionZones.AddRect(30, 432, 562, 684);
                p4ExclusionZones.AddRect(30, 657, 295, 684);
                options.AddInclusionZonesForPage(p4InclusionZones, 4);
                options.AddExclusionZonesForPage(p4ExclusionZones, 4);

                DataExtractionModule.ExtractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options);
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// The following sample illustrates how to extract document classes from PDF documents.
        /// </summary>
        static void TestDocClassifier()
        {
            // Test if the add-on is installed
            if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_classification))
            {
                // Uses the same download URL as the other tests (previously the stale pdftron.com link).
                ReportModuleUnavailable("AIPageObjectExtractor");
                return;
            }

            try
            {
                // Simple example: classify pages as a JSON file
                DataExtractionModule.ExtractData(input_path + "Invoice.pdf", output_path + "Invoice_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification);

                // Classify pages as a JSON string
                string json = DataExtractionModule.ExtractData(input_path + "Scientific_Publication.pdf", DataExtractionModule.DataExtractionEngine.e_doc_classification);
                System.IO.File.WriteAllText(output_path + "Scientific_Publication_Classified.json", json);

                // Example with customized options:
                DataExtractionOptions options = new DataExtractionOptions();
                // Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
                options.SetMinimumConfidenceThreshold(0.7);
                DataExtractionModule.ExtractData(input_path + "Email.pdf", output_path + "Email_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification, options);
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// The main entry point for the application.
        /// </summary>
        static void Main(string[] args)
        {
            // The first step in every application using PDFNet is to initialize the
            // library and set the path to common PDF resources. The library is usually
            // initialized only once, but calling Initialize() multiple times is also fine.
            PDFNet.Initialize(PDFTronLicense.Key);

            try
            {
                PDFNet.AddResourceSearchPath("../../../../../Lib/");

                TestTabularData();
                TestDocumentStructure();
                TestFormFields();
                TestGenericKeyValue();
                TestDocClassifier();
            }
            finally
            {
                // Terminate the library even when a test throws something other than PDFNetException.
                PDFNet.Terminate();
            }
        }
    }
}
281
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/DataExtractionModule.h>
7#include <PDF/PDFNet.h>
8#include <PDF/PDFDoc.h>
9#include <PDF/Convert.h>
10#include <Filters/MemoryFilter.h>
11#include <string>
12#include <iostream>
13#include <fstream>
14#include "../../LicenseKey/CPP/LicenseKey.h"
15
16using namespace pdftron;
17using namespace PDF;
18using namespace Filters;
19using namespace std;
20
21//---------------------------------------------------------------------------------------
22// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
23// extract various types of data from PDF documents.
24//
25// The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
26//---------------------------------------------------------------------------------------
27
28void WriteTextToFile(const std::string& filename, const UString& text)
29{
30 ofstream out_file(filename.c_str(), ofstream::binary);
31 string out_buf = text.ConvertToUtf8();
32 out_file.write(out_buf.c_str(), out_buf.size());
33 out_file.close();
34}
35
36
37string input_path("../../TestFiles/");
38string output_path("../../TestFiles/Output/");
39
40//---------------------------------------------------------------------------------------
41// The following sample illustrates how to extract tables from PDF documents.
42//---------------------------------------------------------------------------------------
43void TestTabularData()
44{
45 // Test if the add-on is installed
46 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular))
47 {
48 cout << endl;
49 cout << "Unable to run Data Extraction: Apryse SDK Tabular Data module not available." << endl;
50 cout << "---------------------------------------------------------------" << endl;
51 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
52 cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
53 cout << "module, ensure that the SDK is able to find the required files" << endl;
54 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
55 return;
56 }
57
58 // Extract tabular data as a JSON file
59 DataExtractionModule::ExtractData(input_path + UString("table.pdf"), output_path + UString("table.json"), DataExtractionModule::e_Tabular);
60
61 // Extract tabular data as a JSON string
62 UString json = DataExtractionModule::ExtractData(input_path + UString("financial.pdf"), DataExtractionModule::e_Tabular);
63 WriteTextToFile((output_path + "financial.json").c_str(), json);
64
65 // Extract tabular data as an XLSX file
66 DataExtractionModule::ExtractToXLSX(input_path + UString("table.pdf"), output_path + UString("table.xlsx"));
67
68 // Extract tabular data as an XLSX stream (also known as filter)
69 MemoryFilter output_xlsx_stream(0, false);
70 DataExtractionOptions options;
71 options.SetPages("1"); // extract page 1
72 DataExtractionModule::ExtractToXLSX(input_path + UString("financial.pdf"), output_xlsx_stream, &options);
73 output_xlsx_stream.SetAsInputFilter();
74 output_xlsx_stream.WriteToFile(output_path + UString("financial.xlsx"), false);
75}
76
77//---------------------------------------------------------------------------------------
78// The following sample illustrates how to extract document structure from PDF documents.
79//---------------------------------------------------------------------------------------
80void TestDocumentStructure()
81{
82 // Test if the add-on is installed
83 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure))
84 {
85 cout << endl;
86 cout << "Unable to run Data Extraction: Apryse SDK Structured Output module not available." << endl;
87 cout << "---------------------------------------------------------------" << endl;
88 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
89 cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
90 cout << "module, ensure that the SDK is able to find the required files" << endl;
91 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
92 return;
93 }
94
95 // Extract document structure as a JSON file
96 DataExtractionModule::ExtractData(input_path + UString("paragraphs_and_tables.pdf"), output_path + UString("paragraphs_and_tables.json"), DataExtractionModule::e_DocStructure);
97
98 // Extract document structure as a JSON string
99 UString json = DataExtractionModule::ExtractData(input_path + UString("tagged.pdf"), DataExtractionModule::e_DocStructure);
100 WriteTextToFile((output_path + "tagged.json").c_str(), json);
101}
102
103//---------------------------------------------------------------------------------------
104// The following sample illustrates how to extract form fields from PDF documents.
105//---------------------------------------------------------------------------------------
106void TestFormFields()
107{
108 // Test if the add-on is installed
109 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form))
110 {
111 cout << endl;
112 cout << "Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available." << endl;
113 cout << "---------------------------------------------------------------" << endl;
114 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
115 cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
116 cout << "module, ensure that the SDK is able to find the required files" << endl;
117 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
118 return;
119 }
120
121 // Extract form fields as a JSON file
122 DataExtractionModule::ExtractData(input_path + UString("formfields-scanned.pdf"), output_path + UString("formfields-scanned.json"), DataExtractionModule::e_Form);
123
124 // Extract form fields as a JSON string
125 UString json = DataExtractionModule::ExtractData(input_path + UString("formfields.pdf"), DataExtractionModule::e_Form);
126 WriteTextToFile((output_path + "formfields.json").c_str(), json);
127
128 //---------------------------------------------------------------------------------------
129 // Detect and add form fields to a PDF document.
130 // PDF document already has form fields, and this sample will update to new found fields.
131 //---------------------------------------------------------------------------------------
132 {
133 PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
134
135 DataExtractionModule::DetectAndAddFormFieldsToPDF(doc);
136
137 // Save the modfied pdf document
138 doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDF::SDFDoc::e_linearized, NULL);
139 }
140
141 //---------------------------------------------------------------------------------------
142 // Detect and add form fields to a PDF document.
143 // PDF document already has form fields, and this sample will keep the original fields.
144 //---------------------------------------------------------------------------------------
145 {
146 PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
147
148 // Setup DataExtractionOptions to keep old fields
149 DataExtractionOptions options;
150 options.SetOverlappingFormFieldBehavior("KeepOld");
151
152 DataExtractionModule::DetectAndAddFormFieldsToPDF(doc, &options);
153
154 // Save the modfied pdf document
155 doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDF::SDFDoc::e_linearized, NULL);
156 }
157}
158
159//---------------------------------------------------------------------------------------
160// The following sample illustrates how to extract key-value pairs from PDF documents.
161//---------------------------------------------------------------------------------------
162void TestGenericKeyValue() {
163
164 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_GenericKeyValue))
165 {
166 cout << endl;
167 cout << "Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available." << endl;
168 cout << "---------------------------------------------------------------" << endl;
169 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
170 cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
171 cout << "module, ensure that the SDK is able to find the required files" << endl;
172 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
173 return;
174 }
175
176 // Simple example: Extract Keys & Values as a JSON file
177 DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val.json"), DataExtractionModule::e_GenericKeyValue);
178
179 // Example with customized options:
180 // Extract Keys & Values from pages 2-4, excluding ads
181 DataExtractionOptions options;
182 options.SetPages("2-4");
183 RectCollection p2_exclusion_zones;
184 // Exclude the add-on on page 2
185 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
186 // Coordinates rotate with the page, if it has rotation applied.
187 p2_exclusion_zones.AddRect(166, 47, 562, 222);
188 options.AddExclusionZonesForPage(p2_exclusion_zones, 2);
189
190 RectCollection p4_inclusion_zones, p4_exclusion_zones;
191 // Only include the article text for page 4, exclude ads and headings
192 p4_inclusion_zones.AddRect(30, 432, 562, 684);
193 p4_exclusion_zones.AddRect(30, 657, 295, 684);
194 options.AddInclusionZonesForPage(p4_inclusion_zones, 4);
195 options.AddExclusionZonesForPage(p4_exclusion_zones, 4);
196
197 DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val_with_zones.json"), DataExtractionModule::e_GenericKeyValue, &options);
198}
199
200//---------------------------------------------------------------------------------------
201// The following sample illustrates how to extract document classes from PDF documents.
202//---------------------------------------------------------------------------------------
203void TestDocClassifier()
204{
205 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocClassification))
206 {
207 cout << endl;
208 cout << "Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available." << endl;
209 cout << "---------------------------------------------------------------" << endl;
210 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
211 cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
212 cout << "module, ensure that the SDK is able to find the required files" << endl;
213 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
214 return;
215 }
216
217 // Simple example: classify pages as a JSON file
218 DataExtractionModule::ExtractData(input_path + UString("Invoice.pdf"), output_path + UString("Invoice_Classified.json"), DataExtractionModule::e_DocClassification);
219
220 // Classify pages as a JSON string
221 UString json = DataExtractionModule::ExtractData(input_path + UString("Scientific_Publication.pdf"), DataExtractionModule::e_DocClassification);
222 WriteTextToFile((output_path + "Scientific_Publication_Classified.json").c_str(), json);
223
224 // Example with customized options:
225 DataExtractionOptions options;
226 // Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
227 options.SetMinimumConfidenceThreshold(0.7);
228 DataExtractionModule::ExtractData(input_path + UString("Email.pdf"), output_path + UString("Email_Classified.json"), DataExtractionModule::e_DocClassification, &options);
229}
230
231int main(int argc, char* argv[])
232{
233 // The first step in every application using PDFNet is to initialize the
234 // library and set the path to common PDF resources. The library is usually
235 // initialized only once, but calling Initialize() multiple times is also fine.
236 PDFNet::Initialize(LicenseKey);
237
238 int ret = 0;
239
240 try
241 {
242 PDFNet::AddResourceSearchPath("../../../Lib/");
243
244 TestTabularData();
245 TestDocumentStructure();
246 TestFormFields();
247 TestGenericKeyValue();
248 TestDocClassifier();
249 }
250 catch (Common::Exception& e)
251 {
252 cout << e << endl;
253 ret = 1;
254 }
255 catch (...)
256 {
257 cout << "Unknown Exception" << endl;
258 ret = 1;
259 }
260
261 PDFNet::Terminate();
262
263 return ret;
264}
265
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8 "fmt"
9 "testing"
10 "os"
11 "flag"
12 . "github.com/pdftron/pdftron-go/v2"
13)
14
15var licenseKey string
16var modulePath string
17
18func init() {
19 flag.StringVar(&licenseKey, "license", "", "License key for Apryse SDK")
20 flag.StringVar(&modulePath, "modulePath", "", "Path for downloaded modules")
21}
22
23//---------------------------------------------------------------------------------------
24// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
25// extract various types of data from PDF documents.
26//
27// The Apryse SDK Data Extraction suite can be downloaded from
28// https://docs.apryse.com/core/guides/info/modules#data-extraction-module
29//
30// Please contact us if you have any questions.
31//---------------------------------------------------------------------------------------
32
33// Relative path to the folder containing the test files.
34var inputPath = "../TestFiles/"
35var outputPath = "../TestFiles/Output/"
36
37//---------------------------------------------------------------------------------------
38
// catch is meant to be deferred directly: if the surrounding function is
// panicking, it stops the panic and stores the panic value, formatted with
// %v, as an error in *err. When there is no panic, *err is left untouched.
func catch(err *error) {
	recovered := recover()
	if recovered == nil {
		return
	}
	*err = fmt.Errorf("%v", recovered)
}
44
45//---------------------------------------------------------------------------------------
46
47func WriteTextToFile(outputFile string, text string) {
48 f, err := os.Create(outputFile)
49 if err != nil {
50 fmt.Println(err)
51 }
52
53 defer f.Close()
54
55 _, err2 := f.WriteString(text)
56 if err2 != nil {
57 fmt.Println(err2)
58 }
59}
60
61//---------------------------------------------------------------------------------------
62// The following sample illustrates how to extract tables from PDF documents.
63//---------------------------------------------------------------------------------------
64
65func TabularDataTest() (err error) {
66 defer catch(&err)
67
68 PDFNetAddResourceSearchPath(modulePath)
69
70 // Test if the add-on is installed
71 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_Tabular) {
72 fmt.Println("")
73 fmt.Println("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.")
74 fmt.Println("-----------------------------------------------------------------------------")
75 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
76 fmt.Println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
77 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
78 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
79 fmt.Println("")
80 return nil
81 }
82
83 // Extract tabular data as a JSON file
84 fmt.Println("Extract tabular data as a JSON file")
85
86 inputFile := inputPath + "table.pdf"
87 outputFile := outputPath + "table.json"
88 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_Tabular)
89
90 fmt.Println("Result saved in " + outputFile)
91
92 // Extract tabular data as a JSON string
93 fmt.Println("Extract tabular data as a JSON string")
94
95 inputFile = inputPath + "financial.pdf"
96 outputFile = outputPath + "financial.json"
97
98 json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_Tabular).(string)
99 WriteTextToFile(outputFile, json)
100
101 fmt.Println("Result saved in " + outputFile)
102
103 // Extract tabular data as an XLSX file
104 fmt.Println("Extract tabular data as an XLSX file")
105
106 inputFile = inputPath + "table.pdf"
107 outputFile = outputPath + "table.xlsx"
108 DataExtractionModuleExtractToXLSX(inputFile, outputFile)
109
110 fmt.Println("Result saved in " + outputFile)
111
112 // Extract tabular data as an XLSX stream (also known as filter)
113 fmt.Println("Extract tabular data as an XLSX stream")
114
115 inputFile = inputPath + "financial.pdf"
116 outputFile = outputPath + "financial.xlsx"
117 outputXlsxStream := NewMemoryFilter(0, false)
118 outputFilter := NewFilter(outputXlsxStream)
119 options := NewDataExtractionOptions()
120 options.SetPages("1"); // page 1
121 DataExtractionModuleExtractToXLSX(inputFile, outputFilter, options)
122 outputXlsxStream.SetAsInputFilter()
123 outputXlsxStream.WriteToFile(outputFile, false)
124
125 fmt.Println("Result saved in " + outputFile)
126
127 return nil
128}
129
130//---------------------------------------------------------------------------------------
131// The following sample illustrates how to extract document structure from PDF documents.
132//---------------------------------------------------------------------------------------
133
134func DocumentStructureTest() (err error) {
135 defer catch(&err)
136
137 // Test if the add-on is installed
138 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_DocStructure) {
139 fmt.Println("")
140 fmt.Println("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
141 fmt.Println("-----------------------------------------------------------------------------")
142 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
143 fmt.Println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
144 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
145 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
146 fmt.Println("")
147 return nil
148 }
149
150 // Extract document structure as a JSON file
151 fmt.Println("Extract document structure as a JSON file")
152
153 inputFile := inputPath + "paragraphs_and_tables.pdf"
154 outputFile := outputPath + "paragraphs_and_tables.json"
155 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocStructure)
156
157 fmt.Println("Result saved in " + outputFile)
158
159 // Extract document structure as a JSON string
160 fmt.Println("Extract document structure as a JSON string")
161
162 inputFile = inputPath + "tagged.pdf"
163 outputFile = outputPath + "tagged.json"
164 json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_DocStructure).(string)
165 WriteTextToFile(outputFile, json)
166
167 fmt.Println("Result saved in " + outputFile)
168
169 return nil
170}
171
172//---------------------------------------------------------------------------------------
173// The following sample illustrates how to extract form fields from PDF documents.
174//---------------------------------------------------------------------------------------
175
176func FormFieldsTest() (err error) {
177 defer catch(&err)
178
179 // Test if the add-on is installed
180 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_Form) {
181 fmt.Println("")
182 fmt.Println("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.")
183 fmt.Println("-----------------------------------------------------------------------------")
184 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
185 fmt.Println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
186 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
187 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
188 fmt.Println("")
189 return nil
190 }
191
192 // Extract form fields as a JSON file
193 fmt.Println("Extract form fields as a JSON file")
194
195 inputFile := inputPath + "formfields-scanned.pdf"
196 outputFile := outputPath + "formfields-scanned.json"
197 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_Form)
198
199 fmt.Println("Result saved in " + outputFile)
200
201 // Extract form fields as a JSON string
202 fmt.Println("Extract form fields as a JSON string")
203
204 inputFile = inputPath + "formfields.pdf"
205 outputFile = outputPath + "formfields.json"
206
207 json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_Form).(string)
208 WriteTextToFile(outputFile, json)
209
210 fmt.Println("Result saved in " + outputFile)
211
212 //////////////////////////////////////////////////////////////////////////
213 // Detect and add form fields to a PDF document.
214 // PDF document already has form fields, and this sample will update to new found fields.
215 doc := NewPDFDoc(inputPath + "formfields-scanned-withfields.pdf")
216
217 fmt.Println("Extract form fields as a PDF file, keep new fields")
218 DataExtractionModuleDetectAndAddFormFieldsToPDF(doc)
219
220 outputFile = outputPath + "formfields-scanned-fields-new.pdf"
221 doc.Save(outputFile, uint(SDFDocE_linearized))
222 doc.Close()
223
224 fmt.Println("Result saved in " + outputFile)
225
226 //////////////////////////////////////////////////////////////////////////
227 // Detect and add form fields to a PDF document.
228 // PDF document already has form fields, and this sample will keep the original fields.
229 doc = NewPDFDoc(inputPath + "formfields-scanned-withfields.pdf")
230
231 // Setup DataExtractionOptions to keep old fields
232 options := NewDataExtractionOptions()
233 options.SetOverlappingFormFieldBehavior("KeepOld")
234
235 fmt.Println("Extract form fields as a PDF file, keep old fields")
236 DataExtractionModuleDetectAndAddFormFieldsToPDF(doc, options)
237
238 outputFile = outputPath + "formfields-scanned-fields-old.pdf"
239 doc.Save(outputFile, uint(SDFDocE_linearized))
240 doc.Close()
241
242 fmt.Println("Result saved in " + outputFile)
243
244 return nil
245}
246
247//---------------------------------------------------------------------------------------
248// The following sample illustrates how to extract key-value pairs from PDF documents.
249//---------------------------------------------------------------------------------------
250
251func GenericKeyValueTest() (err error) {
252 defer catch(&err)
253
254 // Test if the add-on is installed
255 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_GenericKeyValue) {
256 fmt.Println("")
257 fmt.Println("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.")
258 fmt.Println("-----------------------------------------------------------------------------")
259 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
260 fmt.Println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
261 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
262 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
263 fmt.Println("")
264 return nil
265 }
266
267 fmt.Println("Extract key-value pairs from a PDF")
268
269 inputFile := inputPath + "newsletter.pdf"
270 outputFile := outputPath + "newsletter_key_val.json"
271 // Simple example: Extract Keys & Values as a JSON file
272 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_GenericKeyValue)
273
274 fmt.Println("Result saved in " + outputFile)
275
276 // Example with customized options:
277 // Extract Keys & Values from pages 2-4, excluding ads
278 options := NewDataExtractionOptions()
279 options.SetPages("2-4")
280
281 p2ExclusionZones := NewRectCollection()
282 // Exclude the add-on on page 2
283 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
284 // Coordinates rotate with the page, if it has rotation applied.
285 p2ExclusionZones.AddRect(NewRect(166, 47, 562, 222))
286 options.AddExclusionZonesForPage(p2ExclusionZones, 2)
287
288 p4InclusionZones := NewRectCollection()
289 p4ExclusionZones := NewRectCollection()
290 // Only include the article text for page 4, exclude ads and headings
291 p4InclusionZones.AddRect(NewRect(30, 432, 562, 684))
292 p4ExclusionZones.AddRect(NewRect(30, 657, 295, 684))
293 options.AddInclusionZonesForPage(p4InclusionZones, 4)
294 options.AddExclusionZonesForPage(p4ExclusionZones, 4)
295
296 fmt.Println("Extract Key-Value pairs from specific pages and zones as a JSON file")
297 outputFile = outputPath + "newsletter_key_val_with_zones.json"
298 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_GenericKeyValue, options)
299
300 fmt.Println("Result saved in " + outputFile)
301
302 return nil
303}
304
305//---------------------------------------------------------------------------------------
306// The following sample illustrates how to extract document classes from PDF documents.
307//---------------------------------------------------------------------------------------
308
309func DocClassifierTest() (err error) {
310 defer catch(&err)
311
312 // Test if the add-on is installed
313 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_DocClassification) {
314 fmt.Println("")
315 fmt.Println("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.")
316 fmt.Println("-----------------------------------------------------------------------------")
317 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
318 fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
319 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
320 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
321 fmt.Println("")
322 return nil
323 }
324
325 // Simple example: classify pages as a JSON file
326 fmt.Println("Classify pages as a JSON file")
327
328 inputFile := inputPath + "Invoice.pdf"
329 outputFile := outputPath + "Invoice_Classified.json"
330 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocClassification)
331
332 fmt.Println("Result saved in " + outputFile)
333
334 // Classify pages as a JSON string
335 fmt.Println("Classify pages as a JSON string")
336
337 inputFile = inputPath + "Scientific_Publication.pdf"
338 outputFile = outputPath + "Scientific_Publication_Classified.json"
339 json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_DocClassification).(string)
340 WriteTextToFile(outputFile, json)
341
342 fmt.Println("Result saved in " + outputFile)
343
344 // Example with customized options:
345 fmt.Println("Classify pages with customized options")
346
347 inputFile = inputPath + "Email.pdf"
348 outputFile = outputPath + "Email_Classified.json"
349 options := NewDataExtractionOptions()
350 // Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
351 options.SetMinimumConfidenceThreshold(0.7)
352 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocClassification, options)
353
354 fmt.Println("Result saved in " + outputFile)
355
356 return nil
357}
358
359//---------------------------------------------------------------------------------------
360
361func TestDataExtraction(t *testing.T) {
362 // The first step in every application using PDFNet is to initialize the
363 // library. The library is usually initialized only once, but calling
364 // Initialize() multiple times is also fine.
365 PDFNetInitialize(licenseKey)
366
367 //-----------------------------------------------------------------------------------
368
369 PDFNetAddResourceSearchPath("../../../PDFNetC/Lib/")
370
371 //-----------------------------------------------------------------------------------
372
373 err := TabularDataTest()
374 if err != nil {
375 fmt.Println(fmt.Errorf("Unable to extract tabular data, error: %s", err))
376 }
377
378 //-----------------------------------------------------------------------------------
379
380 err = DocumentStructureTest()
381 if err != nil {
382 fmt.Println(fmt.Errorf("Unable to extract document structure data, error: %s", err))
383 }
384
385 //-----------------------------------------------------------------------------------
386
387 err = FormFieldsTest()
388 if err != nil {
389 fmt.Println(fmt.Errorf("Unable to extract form fields data, error: %s", err))
390 }
391
392 err = GenericKeyValueTest()
393 if err != nil {
394 fmt.Println(fmt.Errorf("Unable to extract key-value pairs, error: %s", err))
395 }
396
397 //-----------------------------------------------------------------------------------
398
399 err = DocClassifierTest()
400 if err != nil {
401 fmt.Println(fmt.Errorf("Unable to extract document classifications, error: %s", err))
402 }
403
404 //-----------------------------------------------------------------------------------
405
406 PDFNetTerminate()
407 fmt.Println("Done.")
408}
409
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import java.io.FileWriter;
7import java.io.BufferedWriter;
8import java.io.FileNotFoundException;
9import java.io.IOException;
10
11import com.pdftron.common.PDFNetException;
12import com.pdftron.pdf.*;
13import com.pdftron.filters.*;
14import com.pdftron.sdf.SDFDoc;
15
16//---------------------------------------------------------------------------------------
17// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
18// extract various types of data from PDF documents.
19//
20// The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
21//---------------------------------------------------------------------------------------
22
23public class DataExtractionTest {
24
25 static void writeTextToFile(String filename, String text) throws IOException
26 {
27 BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
28 writer.write(text);
29 writer.close();
30 }
31
32 //---------------------------------------------------------------------------------------
33 // The following sample illustrates how to extract tables from PDF documents.
34 //---------------------------------------------------------------------------------------
35 static void testTabularData()
36 {
37 try {
38 // Test if the add-on is installed
39 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular))
40 {
41 System.out.println();
42 System.out.println("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.");
43 System.out.println("---------------------------------------------------------------");
44 System.out.println("The Data Extraction suite is an optional add-on, available for download");
45 System.out.println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
46 System.out.println("module, ensure that the SDK is able to find the required files");
47 System.out.println("using the PDFNet.addResourceSearchPath() function." );
48 System.out.println();
49 return;
50 }
51 } catch (PDFNetException e) {
52 System.out.println("Data Extraction module not available, error:");
53 e.printStackTrace();
54 System.out.println(e);
55 }
56
57 // Relative path to the folder containing test files.
58 String input_path = "../../TestFiles/";
59 String output_path = "../../TestFiles/Output/";
60
61 try {
62 // Extract tabular data as a JSON file
63 DataExtractionModule.extractData(input_path + "table.pdf", output_path + "table.json", DataExtractionModule.DataExtractionEngine.e_tabular);
64
65 // Extract tabular data as a JSON string
66 String json = DataExtractionModule.extractData(input_path + "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular);
67 writeTextToFile(output_path + "financial.json", json);
68
69 // Extract tabular data as an XLSX file
70 DataExtractionModule.extractToXLSX(input_path + "table.pdf", output_path + "table.xlsx");
71
72 // Extract tabular data as an XLSX stream (also known as filter)
73 DataExtractionOptions options = new DataExtractionOptions();
74 options.setPages("1");
75 MemoryFilter output_xlsx_stream = new MemoryFilter(0, false);
76 DataExtractionModule.extractToXLSX(input_path + "financial.pdf", output_xlsx_stream, options);
77 output_xlsx_stream.setAsInputFilter();
78 output_xlsx_stream.writeToFile(output_path + "financial.xlsx", false);
79
80 } catch (PDFNetException e) {
81 System.out.println(e);
82 }
83 catch (IOException e) {
84 System.out.println(e);
85 }
86 }
87
88 //---------------------------------------------------------------------------------------
89 // The following sample illustrates how to extract document structure from PDF documents.
90 //---------------------------------------------------------------------------------------
91 static void testDocumentStructure()
92 {
93 // Test if the add-on is installed
94 try {
95 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure))
96 {
97 System.out.println();
98 System.out.println("Unable to run Data Extraction: Apryse SDK Structured Output module not available.");
99 System.out.println("---------------------------------------------------------------");
100 System.out.println("The Data Extraction suite is an optional add-on, available for download");
101 System.out.println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
102 System.out.println("module, ensure that the SDK is able to find the required files");
103 System.out.println("using the PDFNet.addResourceSearchPath() function." );
104 System.out.println();
105 return;
106 }
107 } catch (PDFNetException e) {
108 System.out.println("Data Extraction module not available, error:");
109 e.printStackTrace();
110 System.out.println(e);
111 }
112
113 // Relative path to the folder containing test files.
114 String input_path = "../../TestFiles/";
115 String output_path = "../../TestFiles/Output/";
116
117 try {
118 // Extract document structure as a JSON file
119 DataExtractionModule.extractData(input_path + "paragraphs_and_tables.pdf", output_path + "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure);
120
121 // Extract document structure as a JSON string
122 String json = DataExtractionModule.extractData(input_path + "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure);
123 writeTextToFile(output_path + "tagged.json", json);
124
125 } catch (PDFNetException e) {
126 System.out.println(e);
127 }
128 catch (IOException e) {
129 System.out.println(e);
130 }
131 }
132
133 //---------------------------------------------------------------------------------------
134 // The following sample illustrates how to extract form fields from PDF documents.
135 //---------------------------------------------------------------------------------------
136 static void testFormFields()
137 {
138 try {
139 // Test if the add-on is installed
140 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
141 {
142 System.out.println();
143 System.out.println("Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.");
144 System.out.println("---------------------------------------------------------------");
145 System.out.println("The Data Extraction suite is an optional add-on, available for download");
146 System.out.println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
147 System.out.println("module, ensure that the SDK is able to find the required files");
148 System.out.println("using the PDFNet.addResourceSearchPath() function." );
149 System.out.println();
150 return;
151 }
152 } catch (PDFNetException e) {
153 System.out.println("Data Extraction module not available, error:");
154 e.printStackTrace();
155 System.out.println(e);
156 }
157
158 // Relative path to the folder containing test files.
159 String input_path = "../../TestFiles/";
160 String output_path = "../../TestFiles/Output/";
161
162 try {
163 // Extract form fields as a JSON file
164 DataExtractionModule.extractData(input_path + "formfields-scanned.pdf", output_path + "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form);
165
166 // Extract form fields as a JSON string
167 String json = DataExtractionModule.extractData(input_path + "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form);
168 writeTextToFile(output_path + "formfields.json", json);
169
170 //---------------------------------------------------------------------------------------
171 // Detect and add form fields to a PDF document.
172 // PDF document already has form fields, and this sample will update to new found fields.
173 //---------------------------------------------------------------------------------------
174 try (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
175 {
176 DataExtractionModule.detectAndAddFormFieldsToPDF(doc);
177
178 // Save the modfied pdf document
179 doc.save(output_path + "formfields-scanned-fields-new.pdf", SDFDoc.SaveMode.LINEARIZED, null);
180 } catch (Exception e) {
181 e.printStackTrace();
182 }
183
184 //---------------------------------------------------------------------------------------
185 // Detect and add form fields to a PDF document.
186 // PDF document already has form fields, and this sample will keep the original fields.
187 //---------------------------------------------------------------------------------------
188 try (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
189 {
190 // Setup DataExtractionOptions to keep old fields
191 DataExtractionOptions options = new DataExtractionOptions();
192 options.setOverlappingFormFieldBehavior("KeepOld");
193
194 DataExtractionModule.detectAndAddFormFieldsToPDF(doc, options);
195
196 // Save the modfied pdf document
197 doc.save(output_path + "formfields-scanned-fields-old.pdf", SDFDoc.SaveMode.LINEARIZED, null);
198 } catch (Exception e) {
199 e.printStackTrace();
200 }
201
202 } catch (PDFNetException e) {
203 System.out.println(e);
204 }
205 catch (IOException e) {
206 System.out.println(e);
207 }
208 }
209
210 //---------------------------------------------------------------------------------------
211 // The following sample illustrates how to extract key-value pairs from PDF documents.
212 //---------------------------------------------------------------------------------------
213 public static void testGenericKeyValue() {
214 try {
215 // Test if the add-on is installed
216 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
217 {
218 System.out.println();
219 System.out.println("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.");
220 System.out.println("---------------------------------------------------------------");
221 System.out.println("The Data Extraction suite is an optional add-on, available for download");
222 System.out.println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
223 System.out.println("module, ensure that the SDK is able to find the required files");
224 System.out.println("using the PDFNet.addResourceSearchPath() function." );
225 System.out.println();
226 return;
227 }
228 } catch (PDFNetException e) {
229 System.out.println("Data Extraction module not available, error:");
230 e.printStackTrace();
231 System.out.println(e);
232 }
233
234 // Relative path to the folder containing test files.
235 String input_path = "../../TestFiles/";
236 String output_path = "../../TestFiles/Output/";
237
238 try {
239
240 // Simple example: Extract Keys & Values as a JSON file
241 DataExtractionModule.extractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value);
242
243 // Example with customized options:
244 // Extract Keys & Values from pages 2-4, excluding ads
245 DataExtractionOptions options = new DataExtractionOptions();
246 options.setPages("2-4");
247
248 RectCollection p2ExclusionZones = new RectCollection();
249 // Exclude the add-on on page 2
250 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
251 // Coordinates rotate with the page, if it has rotation applied.
252 p2ExclusionZones.addRect(166, 47, 562, 222);
253 options.addExclusionZonesForPage(p2ExclusionZones, 2);
254
255 RectCollection p4InclusionZones = new RectCollection();
256 RectCollection p4ExclusionZones = new RectCollection();
257 // Only include the article text for page 4, exclude ads and headings
258 p4InclusionZones.addRect(30, 432, 562, 684);
259 p4ExclusionZones.addRect(30, 657, 295, 684);
260 options.addInclusionZonesForPage(p4InclusionZones, 4);
261 options.addExclusionZonesForPage(p4ExclusionZones, 4);
262
263 DataExtractionModule.extractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options);
264
265 } catch (Exception e) {
266 System.out.println(e);
267 }
268 }
269
270 //---------------------------------------------------------------------------------------
271 // The following sample illustrates how to extract document classes from PDF documents.
272 //---------------------------------------------------------------------------------------
273 public static void testDocClassifier() {
274 try {
275 // Test if the add-on is installed
276 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_classification))
277 {
278 System.out.println();
279 System.out.println("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.");
280 System.out.println("---------------------------------------------------------------");
281 System.out.println("The Data Extraction suite is an optional add-on, available for download");
282 System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
283 System.out.println("module, ensure that the SDK is able to find the required files");
284 System.out.println("using the PDFNet.addResourceSearchPath() function." );
285 System.out.println();
286 return;
287 }
288 } catch (PDFNetException e) {
289 System.out.println("Data Extraction module not available, error:");
290 e.printStackTrace();
291 System.out.println(e);
292 }
293
294 // Relative path to the folder containing test files.
295 String input_path = "../../TestFiles/";
296 String output_path = "../../TestFiles/Output/";
297
298 try {
299
300 // Simple example: classify pages as a JSON file
301 DataExtractionModule.extractData(input_path + "Invoice.pdf", output_path + "Invoice_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification);
302
303 // Classify pages as a JSON string
304 String json = DataExtractionModule.extractData(input_path + "Scientific_Publication.pdf", DataExtractionModule.DataExtractionEngine.e_doc_classification);
305 writeTextToFile(output_path + "Scientific_Publication_Classified.json", json);
306
307 // Example with customized options:
308 DataExtractionOptions options = new DataExtractionOptions();
309 // Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
310 options.setMinimumConfidenceThreshold(0.7);
311 DataExtractionModule.extractData(input_path + "Email.pdf", output_path + "Email_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification, options);
312
313 } catch (Exception e) {
314 System.out.println(e);
315 }
316 }
317
318 public static void main(String[] args)
319 {
320 // The first step in every application using PDFNet is to initialize the
321 // library and set the path to common PDF resources. The library is usually
322 // initialized only once, but calling initialize() multiple times is also fine.
323 PDFNet.initialize(PDFTronLicense.Key());
324 PDFNet.addResourceSearchPath("../../../Lib/");
325
326 testTabularData();
327 testDocumentStructure();
328 testFormFields();
329 testGenericKeyValue();
330 testDocClassifier();
331
332 PDFNet.terminate();
333 }
334}
335
1 <?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
10//---------------------------------------------------------------------------------------
11// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
12// extract various types of data from PDF documents.
13//
14// The Apryse SDK Data Extraction suite can be downloaded from
15// https://docs.apryse.com/core/guides/info/modules
16//
17// Please contact us if you have any questions.
18//---------------------------------------------------------------------------------------
19
/**
 * Writes $text to $outputFile, opening the file in "w" mode.
 *
 * @param string $outputFile Path of the file to write.
 * @param string $text       Content to write.
 * @throws RuntimeException  If the file cannot be opened or written, so that
 *                           callers catching Exception see the failure.
 */
function WriteTextToFile($outputFile, $text)
{
    $outfile = fopen($outputFile, "w");
    if ($outfile === false) {
        // Without this check, false would be passed on to fwrite()/fclose().
        throw new RuntimeException("Unable to open " . $outputFile . " for writing");
    }

    try {
        if (fwrite($outfile, $text) === false) {
            throw new RuntimeException("Unable to write to " . $outputFile);
        }
    } finally {
        // Release the handle whether or not the write succeeded.
        fclose($outfile);
    }
}
26
27function main()
28{
29 // Relative path to the folder containing the test files.
30 $inputPath = getcwd()."/../../TestFiles/";
31 $outputPath = $inputPath."Output/";
32
33 // The first step in every application using PDFNet is to initialize the
34 // library. The library is usually initialized only once, but calling
35 // Initialize() multiple times is also fine.
36 global $LicenseKey;
37 PDFNet::Initialize($LicenseKey);
38 PDFNet::GetSystemFontList(); // Wait for fonts to be loaded if they haven't already. This is done because PHP can run into errors when shutting down if font loading is still in progress.
39
40 //-----------------------------------------------------------------------------------
41
42 PDFNet::AddResourceSearchPath("../../../PDFNetC/Lib/");
43
44 //////////////////////////////////////////////////////////////////////////
45 // The following sample illustrates how to extract tables from PDF documents.
46 //////////////////////////////////////////////////////////////////////////
47
48 // Test if the add-on is installed
49 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular)) {
50 echo(nl2br("\n"));
51 echo(nl2br("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.\n"));
52 echo(nl2br("-----------------------------------------------------------------------------\n"));
53 echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
54 echo(nl2br("at https://docs.apryse.com/core/guides/info/modules. If you have already\n"));
55 echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
56 echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
57 echo(nl2br("\n"));
58 }
59 else {
60 try {
61 // Extract tabular data as a JSON file
62 echo(nl2br("Extract tabular data as a JSON file\n"));
63
64 $outputFile = $outputPath."table.json";
65 DataExtractionModule::ExtractData($inputPath."table.pdf", $outputFile, DataExtractionModule::e_Tabular);
66
67 echo(nl2br("Result saved in " . $outputFile . "\n"));
68
69 ///////////////////////////////////////////////////////
70 // Extract tabular data as a JSON string
71 echo(nl2br("Extract tabular data as a JSON string\n"));
72
73 $outputFile = $outputPath."financial.json";
74 $json = DataExtractionModule::ExtractData($inputPath."financial.pdf", DataExtractionModule::e_Tabular);
75 WriteTextToFile($outputFile, $json);
76
77 echo(nl2br("Result saved in " . $outputFile . "\n"));
78
79 ///////////////////////////////////////////////////////
80 // Extract tabular data as an XLSX file
81 echo(nl2br("Extract tabular data as an XLSX file\n"));
82
83 $outputFile = $outputPath."table.xlsx";
84 DataExtractionModule::ExtractToXLSX($inputPath."table.pdf", $outputFile);
85
86 echo(nl2br("Result saved in " . $outputFile . "\n"));
87
88 ///////////////////////////////////////////////////////
89 // Extract tabular data as an XLSX stream (also known as filter)
90 echo(nl2br("Extract tabular data as an XLSX stream\n"));
91
92 $outputFile = $outputPath."financial.xlsx";
93 $outputXlsxStream = new MemoryFilter(0, false);
94 $options = new DataExtractionOptions();
95 $options->SetPages("1"); // page 1
96 DataExtractionModule::ExtractToXLSX($inputPath."financial.pdf", $outputXlsxStream, $options);
97 $outputXlsxStream->SetAsInputFilter();
98 $outputXlsxStream->WriteToFile($outputFile, false);
99
100 echo(nl2br("Result saved in " . $outputFile . "\n"));
101 }
102 catch(Exception $e) {
103 echo(nl2br("Unable to extract tabular data, error: " . $e->getMessage() . "\n"));
104 }
105 }
106
107 //////////////////////////////////////////////////////////////////////////
108 // The following sample illustrates how to extract document structure from PDF documents.
109 //////////////////////////////////////////////////////////////////////////
110
111 // Test if the add-on is installed
112 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure)) {
113 echo(nl2br("\n"));
114 echo(nl2br("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.\n"));
115 echo(nl2br("-----------------------------------------------------------------------------\n"));
116 echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
117 echo(nl2br("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module. If you have already\n"));
118 echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
119 echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
120 echo(nl2br("\n"));
121 }
122 else {
123 try {
124 // Extract document structure as a JSON file
125 echo(nl2br("Extract document structure as a JSON file\n"));
126
127 $outputFile = $outputPath."paragraphs_and_tables.json";
128 DataExtractionModule::ExtractData($inputPath."paragraphs_and_tables.pdf", $outputFile, DataExtractionModule::e_DocStructure);
129
130 echo(nl2br("Result saved in " . $outputFile . "\n"));
131
132 ///////////////////////////////////////////////////////
133 // Extract document structure as a JSON string
134 echo(nl2br("Extract document structure as a JSON string\n"));
135
136 $outputFile = $outputPath."tagged.json";
137 $json = DataExtractionModule::ExtractData($inputPath."tagged.pdf", DataExtractionModule::e_DocStructure);
138 WriteTextToFile($outputFile, $json);
139
140 echo(nl2br("Result saved in " . $outputFile . "\n"));
141 }
142 catch(Exception $e) {
143 echo(nl2br("Unable to extract document structure data, error: " . $e->getMessage() . "\n"));
144 }
145 }
146
147 //////////////////////////////////////////////////////////////////////////
148 // The following sample illustrates how to extract form fields from PDF documents.
149 //////////////////////////////////////////////////////////////////////////
150
151 // Test if the add-on is installed
152 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form)) {
153 echo(nl2br("\n"));
154 echo(nl2br("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.\n"));
155 echo(nl2br("-----------------------------------------------------------------------------\n"));
156 echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
157 echo(nl2br("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already\n"));
158 echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
159 echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
160 echo(nl2br("\n"));
161 }
162 else {
163 try {
164 // Extract form fields as a JSON file
165 echo(nl2br("Extract form fields as a JSON file\n"));
166
167 $outputFile = $outputPath."formfields-scanned.json";
168 DataExtractionModule::ExtractData($inputPath."formfields-scanned.pdf", $outputFile, DataExtractionModule::e_Form);
169
170 echo(nl2br("Result saved in " . $outputFile . "\n"));
171
172 ///////////////////////////////////////////////////////
173 // Extract form fields as a JSON string
174 echo(nl2br("Extract form fields as a JSON string\n"));
175
176 $outputFile = $outputPath."formfields.json";
177 $json = DataExtractionModule::ExtractData($inputPath."formfields.pdf", DataExtractionModule::e_Form);
178 WriteTextToFile($outputFile, $json);
179
180 echo(nl2br("Result saved in " . $outputFile . "\n"));
181
182 ///////////////////////////////////////////////////////
183 // Detect and add form fields to a PDF document.
184 // PDF document already has form fields, and this sample will update to new found fields.
185 echo(nl2br("Extract form fields as a PDF file\n"));
186
187 $doc = new PDFDoc($inputPath."formfields-scanned-withfields.pdf");
188 DataExtractionModule::DetectAndAddFormFieldsToPDF($doc);
189 $doc->Save($outputPath."formfields-scanned-fields-new.pdf", SDFDoc::e_linearized);
190 $doc->Close();
191
192 echo(nl2br("Result saved in " . $outputPath ."formfields-scanned-fields-new.pdf" . "\n"));
193
194 ///////////////////////////////////////////////////////
195 // Detect and add form fields to a PDF document.
196 // PDF document already has form fields, and this sample will keep the original fields.
197 echo(nl2br("Extract form fields as a PDF file\n"));
198
199 $doc = new PDFDoc($inputPath."formfields-scanned-withfields.pdf");
200 $options = new DataExtractionOptions();
201 $options->SetOverlappingFormFieldBehavior("KeepOld");
202 DataExtractionModule::DetectAndAddFormFieldsToPDF($doc, $options);
203 $doc->Save($outputPath."formfields-scanned-fields-old.pdf", SDFDoc::e_linearized);
204 $doc->Close();
205
206 echo(nl2br("Result saved in " . $outputPath ."formfields-scanned-fields-old.pdf" . "\n"));
207
208 }
209 catch(Exception $e) {
210 echo(nl2br("Unable to extract form fields data, error: " . $e->getMessage() . "\n"));
211 }
212 }
213
214 //////////////////////////////////////////////////////////////////////////
215 // The following sample illustrates how to extract key-value pairs from PDF documents.
216 //////////////////////////////////////////////////////////////////////////
217
218 // Test if the add-on is installed
219 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_GenericKeyValue)) {
220 echo(nl2br("\n"));
221 echo(nl2br("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.\n"));
222 echo(nl2br("-----------------------------------------------------------------------------\n"));
223 echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
224 echo(nl2br("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already\n"));
225 echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
226 echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
227 echo(nl2br("\n"));
228 }
229 else {
230 try {
231
232 echo(nl2br("Extract key-value pairs from a PDF\n"));
233 // Simple example: Extract Keys & Values as a JSON file
234 $outputFile = $outputPath."newsletter_key_val.json";
235 DataExtractionModule::ExtractData($inputPath."newsletter.pdf", $outputFile, DataExtractionModule::e_GenericKeyValue);
236
237 echo(nl2br("Result saved in " . $outputFile . "\n"));
238
239 // Example with customized options:
240 // Extract Keys & Values from pages 2-4, excluding ads
241 $options = new DataExtractionOptions();
242 $options->setPages("2-4");
243
244 $p2ExclusionZones = new RectCollection();
245 // Exclude the ad on page 2
246 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
247 // Coordinates rotate with the page, if it has rotation applied.
248 $p2ExclusionZones->AddRect(new Rect(166.0, 47.0, 562.0, 222.0));
249 $options->AddExclusionZonesForPage($p2ExclusionZones, 2);
250
251 $p4InclusionZones = new RectCollection();
252 $p4ExclusionZones = new RectCollection();
253 // Only include the article text for page 4, exclude ads and headings
254 $p4InclusionZones->AddRect(new Rect(30.0, 432.0, 562.0, 684.0));
255 $p4ExclusionZones->AddRect(new Rect(30.0, 657.0, 295.0, 684.0));
256 $options->AddInclusionZonesForPage($p4InclusionZones, 4);
257 $options->AddExclusionZonesForPage($p4ExclusionZones, 4);
258
259 echo(nl2br("Extract Key-Value pairs from specific pages and zones as a JSON file\n"));
260 $outputFile = $outputPath."newsletter_key_val_with_zones.json";
261 DataExtractionModule::ExtractData($inputPath."newsletter.pdf", $outputFile, DataExtractionModule::e_GenericKeyValue, $options);
262
263 echo(nl2br("Result saved in " . $outputFile . "\n"));
264 }
265 catch(Exception $e) {
266 echo(nl2br("Unable to extract document structure data, error: " . $e->getMessage() . "\n"));
267 }
268 }
269
270 //////////////////////////////////////////////////////////////////////////
271 // The following sample illustrates how to extract document classes from PDF documents.
272 //////////////////////////////////////////////////////////////////////////
273
274 // Test if the add-on is installed
275 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocClassification)) {
276 echo(nl2br("\n"));
277 echo(nl2br("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.\n"));
278 echo(nl2br("-----------------------------------------------------------------------------\n"));
279 echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
280 echo(nl2br("at https://docs.apryse.com/documentation/core/info/modules/. If you have already\n"));
281 echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
282 echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
283 echo(nl2br("\n"));
284 }
285 else {
286 try {
287 // Simple example: classify pages as a JSON file
288 echo(nl2br("Classify pages as a JSON file\n"));
289
290 $outputFile = $outputPath."Invoice_Classified.json";
291 DataExtractionModule::ExtractData($inputPath."Invoice.pdf", $outputFile, DataExtractionModule::e_DocClassification);
292
293 echo(nl2br("Result saved in " . $outputFile . "\n"));
294
295 ///////////////////////////////////////////////////////
296 // Classify pages as a JSON string
297 echo(nl2br("Classify pages as a JSON string\n"));
298
299 $outputFile = $outputPath."Scientific_Publication_Classified.json";
300 $json = DataExtractionModule::ExtractData($inputPath."Scientific_Publication.pdf", DataExtractionModule::e_DocClassification);
301 WriteTextToFile($outputFile, $json);
302
303 echo(nl2br("Result saved in " . $outputFile . "\n"));
304
305 ///////////////////////////////////////////////////////
306 // Example with customized options:
307 echo(nl2br("Classify pages with customized options\n"));
308
309 $options = new DataExtractionOptions();
310 // Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
311 $options->SetMinimumConfidenceThreshold(0.7);
312 $outputFile = $outputPath."Email_Classified.json";
313 DataExtractionModule::ExtractData($inputPath."Email.pdf", $outputFile, DataExtractionModule::e_DocClassification, $options);
314
315 echo(nl2br("Result saved in " . $outputFile . "\n"));
316 }
317 catch(Exception $e) {
318 echo(nl2br("Unable to extract document structure data, error: " . $e->getMessage() . "\n"));
319 }
320 }
321
322 //-----------------------------------------------------------------------------------
323
324 PDFNet::Terminate();
325 echo(nl2br("Done.\n"));
326}
327
328main();
329?>
330
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6//---------------------------------------------------------------------------------------
7// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
8// extract various types of data from PDF documents.
9//
10// The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
11//---------------------------------------------------------------------------------------
12
13const fs = require('fs');
14const { PDFNet } = require('../../lib/pdfnet.js');
15const PDFTronLicense = require('../../LicenseKey/NODEJS/LicenseKey');
16
17((exports) => {
18 'use strict';
19
20 exports.runDataExtractionTest = () => {
21
22 const main = async () => {
23
24 const inputPath = '../TestFiles/';
25 const outputPath = '../TestFiles/Output/';
26
27 //////////////////////////////////////////////////////////////////////////
28
29 await PDFNet.addResourceSearchPath('../../lib/');
30
31 //////////////////////////////////////////////////////////////////////////
32 // The following sample illustrates how to extract tables from PDF documents.
33 //////////////////////////////////////////////////////////////////////////
34
35 // Test if the add-on is installed
36 if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular)) {
37 console.log('\nUnable to run Data Extraction: Apryse SDK Tabular Data module not available.');
38 console.log('---------------------------------------------------------------');
39 console.log('The Data Extraction suite is an optional add-on, available for download');
40 console.log('at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already');
41 console.log('downloaded this module, ensure that the SDK is able to find the required files');
42 console.log('using the PDFNet.addResourceSearchPath() function.\n');
43 }
44 else
45 {
46 try {
47 // Extract tabular data as a JSON file
48 console.log('Extract tabular data as a JSON file');
49
50 let outputFile = outputPath + 'table.json';
51 await PDFNet.DataExtractionModule.extractData(inputPath + 'table.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular);
52
53 console.log('Result saved in ' + outputFile);
54
55 ///////////////////////////////////////////////////////
56 // Extract tabular data as a JSON string
57 console.log('Extract tabular data as a JSON string');
58
59 outputFile = outputPath + 'financial.json';
60 const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'financial.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular);
61 fs.writeFileSync(outputFile, json);
62
63 console.log('Result saved in ' + outputFile);
64
65 ///////////////////////////////////////////////////////
66 // Extract tabular data as an XLSX file
67 console.log('Extract tabular data as an XLSX file');
68
69 outputFile = outputPath + 'table.xlsx';
70 await PDFNet.DataExtractionModule.extractToXLSX(inputPath + 'table.pdf', outputFile);
71
72 console.log('Result saved in ' + outputFile);
73
74 ///////////////////////////////////////////////////////
75 // Extract tabular data as an XLSX stream (also known as filter)
76 console.log('Extract tabular data as an XLSX stream');
77
78 outputFile = outputPath + 'financial.xlsx';
79 const outputXlsxStream = await PDFNet.Filter.createMemoryFilter(0, false);
80 const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
81 options.setPages('1'); // page 1
82 await PDFNet.DataExtractionModule.extractToXLSXWithFilter(inputPath + 'financial.pdf', outputXlsxStream, options);
83 outputXlsxStream.memoryFilterSetAsInputFilter();
84 outputXlsxStream.writeToFile(outputFile, false);
85
86 console.log('Result saved in ' + outputFile);
87 } catch (err) {
88 console.log(err);
89 }
90 }
91
92 //////////////////////////////////////////////////////////////////////////
93 // The following sample illustrates how to extract document structure from PDF documents.
94 //////////////////////////////////////////////////////////////////////////
95
96 // Test if the add-on is installed
97 if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure)) {
98 console.log('\nUnable to run Data Extraction: Apryse SDK Structured Output module not available.');
99 console.log('---------------------------------------------------------------');
100 console.log('The Data Extraction suite is an optional add-on, available for download');
101 console.log('at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already');
102 console.log('downloaded this module, ensure that the SDK is able to find the required files');
103 console.log('using the PDFNet.addResourceSearchPath() function.\n');
104 }
105 else
106 {
107 try {
108 // Extract document structure as a JSON file
109 console.log('Extract document structure as a JSON file');
110
111 let outputFile = outputPath + 'paragraphs_and_tables.json';
112 await PDFNet.DataExtractionModule.extractData(inputPath + 'paragraphs_and_tables.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
113
114 console.log('Result saved in ' + outputFile);
115
116 ///////////////////////////////////////////////////////
117 // Extract document structure as a JSON string
118 console.log('Extract document structure as a JSON string');
119
120 outputFile = outputPath + 'tagged.json';
121 const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'tagged.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
122 fs.writeFileSync(outputFile, json);
123
124 console.log('Result saved in ' + outputFile);
125 } catch (err) {
126 console.log(err);
127 }
128 }
129
130 //////////////////////////////////////////////////////////////////////////
131 // The following sample illustrates how to extract form fields from PDF documents.
132 //////////////////////////////////////////////////////////////////////////
133
134 // Test if the add-on is installed
135 if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_Form)) {
136 console.log('\nUnable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.');
137 console.log('---------------------------------------------------------------');
138 console.log('The Data Extraction suite is an optional add-on, available for download');
139 console.log('at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already');
140 console.log('downloaded this module, ensure that the SDK is able to find the required files');
141 console.log('using the PDFNet.addResourceSearchPath() function.\n');
142 }
143 else
144 {
145 try {
146 // Extract form fields as a JSON file
147 console.log('Extract form fields as a JSON file');
148
149 let outputFile = outputPath + 'formfields-scanned.json';
150 await PDFNet.DataExtractionModule.extractData(inputPath + 'formfields-scanned.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_Form);
151
152 console.log('Result saved in ' + outputFile);
153
154 ///////////////////////////////////////////////////////
155 // Extract form fields as a JSON string
156 console.log('Extract form fields as a JSON string');
157
158 outputFile = outputPath + 'formfields.json';
159 const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'formfields.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_Form);
160 fs.writeFileSync(outputFile, json);
161
162 console.log('Result saved in ' + outputFile);
163
164 //////////////////////////////////////////////////////////////////////////
165 // Detect and add form fields to a PDF document.
166 // Document already has form fields, and this sample will update to new found fields.
167 {
168 console.log('Detect and add form fields in a PDF file, keep new fields');
169
170 const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'formfields-scanned-withfields.pdf');
171
172 await PDFNet.DataExtractionModule.detectAndAddFormFieldsToPDF(doc);
173 outputFile = outputPath + 'formfields-scanned-fields-new.pdf';
174 await doc.save(outputFile, PDFNet.SDFDoc.SaveOptions.e_linearized);
175
176 console.log('Result saved in ' + outputFile);
177 }
178
179 //////////////////////////////////////////////////////////////////////////
180 // Detect and add form fields to a PDF document.
181 // Document already has form fields, and this sample will keep the original fields.
182 {
183 console.log('Detect and add form fields in a PDF file, keep old fields');
184
185 const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'formfields-scanned-withfields.pdf');
186
187 const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
188 options.setOverlappingFormFieldBehavior('KeepOld');
189
190 await PDFNet.DataExtractionModule.detectAndAddFormFieldsToPDF(doc, options);
191 outputFile = outputPath + 'formfields-scanned-fields-old.pdf';
192 await doc.save(outputFile, PDFNet.SDFDoc.SaveOptions.e_linearized);
193 }
194
195 console.log('Result saved in ' + outputFile);
196
197 } catch (err) {
198 console.log(err);
199 }
200 }
201
202 //////////////////////////////////////////////////////////////////////////
203 // The following sample illustrates how to extract key-value pairs from PDF documents.
204 //////////////////////////////////////////////////////////////////////////
205 if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_GenericKeyValue)) {
206 console.log();
207 console.log('Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.');
208 console.log('---------------------------------------------------------------');
209 console.log('The Data Extraction suite is an optional add-on, available for download');
210 console.log('at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this');
211 console.log('module, ensure that the SDK is able to find the required files');
212 console.log('using the PDFNet.addResourceSearchPath() function.');
213 console.log();
214 }
215 else
216 {
217 try {
218 // Simple example: Extract Keys & Values as a JSON file
219 console.log('Extract Key-Value pairs as a JSON file');
220 await PDFNet.DataExtractionModule.extractData(inputPath + 'newsletter.pdf', outputPath + 'newsletter_key_val.json', PDFNet.DataExtractionModule.DataExtractionEngine.e_GenericKeyValue);
221 console.log('Result saved in ' + outputPath + 'newsletter_key_val.json');
222
223 const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
224 options.setPages('2-4');
225
226 const p2ExclusionZones = [];
227 // Exclude the add-on page 2
228 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
229 // Coordinates rotate with the page, if it has rotation applied.
230 p2ExclusionZones.push(new PDFNet.Rect(166, 47, 562, 222));
231 options.addExclusionZonesForPage(p2ExclusionZones, 2);
232
233 const p4InclusionZones = [];
234 const p4ExclusionZones = [];
235 // Only include the article text for page 4, exclude ads and headings
236 p4InclusionZones.push(new PDFNet.Rect(30, 432, 562, 684));
237 p4ExclusionZones.push(new PDFNet.Rect(30, 657, 295, 684));
238 options.addInclusionZonesForPage(p4InclusionZones, 4);
239 options.addExclusionZonesForPage(p4ExclusionZones, 4);
240 console.log('Extract Key-Value pairs from specific pages and zones as a JSON file');
241 await PDFNet.DataExtractionModule.extractData(inputPath + 'newsletter.pdf', outputPath + 'newsletter_key_val_with_zones.json', PDFNet.DataExtractionModule.DataExtractionEngine.e_GenericKeyValue, options);
242 console.log('Result saved in ' + outputPath + 'newsletter_key_val_with_zones.json');
243 } catch (err) {
244 console.log(err);
245 }
246 }
247
248 //////////////////////////////////////////////////////////////////////////
249 // The following sample illustrates how to extract document classes from PDF documents.
250 //////////////////////////////////////////////////////////////////////////
251
252 // Test if the add-on is installed
253 if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_DocClassification)) {
254 console.log('\nUnable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.');
255 console.log('---------------------------------------------------------------');
256 console.log('The Data Extraction suite is an optional add-on, available for download');
257 console.log('at https://docs.apryse.com/documentation/core/info/modules/. If you have already');
258 console.log('downloaded this module, ensure that the SDK is able to find the required files');
259 console.log('using the PDFNet.addResourceSearchPath() function.\n');
260 }
261 else
262 {
263 try {
264 // Simple example: classify pages as a JSON file
265 console.log('Classify pages as a JSON file');
266
267 let outputFile = outputPath + 'Invoice_Classified.json';
268 await PDFNet.DataExtractionModule.extractData(inputPath + 'Invoice.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_DocClassification);
269
270 console.log('Result saved in ' + outputFile);
271
272 ///////////////////////////////////////////////////////
273 // Classify pages as a JSON string
274 console.log('Classify pages as a JSON string');
275
276 outputFile = outputPath + 'Scientific_Publication_Classified.json';
277 const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'Scientific_Publication.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_DocClassification);
278 fs.writeFileSync(outputFile, json);
279
280 console.log('Result saved in ' + outputFile);
281
282 ///////////////////////////////////////////////////////
283 // Example with customized options:
284 console.log('Classify pages with customized options');
285
286 const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
287 // Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
288 options.setMinimumConfidenceThreshold(0.7);
289 outputFile = outputPath + 'Email_Classified.json';
290 await PDFNet.DataExtractionModule.extractData(inputPath + 'Email.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_DocClassification, options);
291
292 console.log('Result saved in ' + outputFile);
293
294 } catch (err) {
295 console.log(err);
296 }
297 }
298
299 //////////////////////////////////////////////////////////////////////////
300
301 console.log('Done.');
302 };
303
304 PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function (error) {
305 console.log('Error: ' + JSON.stringify(error));
306 }).then(function () { return PDFNet.shutdown(); });
307 };
308 exports.runDataExtractionTest();
309})(exports);
310// eslint-disable-next-line spaced-comment
311//# sourceURL=DataExtractionTest.js
312
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11import platform
12
13sys.path.append("../../LicenseKey/PYTHON")
14from LicenseKey import *
15
16#---------------------------------------------------------------------------------------
17# The Data Extraction suite is an optional PDFNet add-on collection that can be used to
18# extract various types of data from PDF documents.
19#
20# The Apryse SDK Data Extraction suite can be downloaded from
21# https://docs.apryse.com/core/guides/info/modules#data-extraction-module
22#
23# Please contact us if you have any questions.
24#---------------------------------------------------------------------------------------
25
26# Relative path to the folder containing the test files.
27inputPath = "../../TestFiles/"
28outputPath = "../../TestFiles/Output/"
29
def WriteTextToFile(outputFile, text):
    """Write ``text`` to ``outputFile`` as UTF-8, replacing any existing file.

    Args:
        outputFile: Path of the file to create or overwrite.
        text: The text to store. A byte string is accepted too and is
            interpreted as UTF-8.

    Raises:
        IOError/OSError: If the file cannot be opened or written.
    """
    # Local import so this helper does not depend on a file-level import.
    import io

    # A byte string (for example a Python 2 ``str``) cannot be written to a
    # text-mode file, so decode it first.
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # An explicit encoding keeps the output independent of the platform's
    # default code page, which may be unable to represent the extracted text.
    # The ``with`` block closes the file even if the write fails.
    with io.open(outputFile, "w", encoding="utf-8") as f:
        f.write(text)
37
38def main():
39 # The first step in every application using PDFNet is to initialize the
40 # library. The library is usually initialized only once, but calling
41 # Initialize() multiple times is also fine.
42 PDFNet.Initialize(LicenseKey)
43
44 PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")
45
46 #-----------------------------------------------------------------------------------
47 # The following sample illustrates how to extract tables from PDF documents.
48 #-----------------------------------------------------------------------------------
49
50 # Test if the add-on is installed
51 if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_Tabular):
52 print("")
53 print("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.")
54 print("-----------------------------------------------------------------------------")
55 print("The Data Extraction suite is an optional add-on, available for download")
56 print("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
57 print("downloaded this module, ensure that the SDK is able to find the required files")
58 print("using the PDFNet.AddResourceSearchPath() function.")
59 print("")
60 else:
61 try:
62 # Extract tabular data as a JSON file
63 print("Extract tabular data as a JSON file")
64
65 outputFile = outputPath + "table.json"
66 DataExtractionModule.ExtractData(inputPath + "table.pdf", outputFile, DataExtractionModule.e_Tabular)
67
68 print("Result saved in " + outputFile)
69
70 #------------------------------------------------------
71 # Extract tabular data as a JSON string
72 print("Extract tabular data as a JSON string")
73
74 outputFile = outputPath + "financial.json"
75 json = DataExtractionModule.ExtractData(inputPath + "financial.pdf", DataExtractionModule.e_Tabular)
76 WriteTextToFile(outputFile, json)
77
78 print("Result saved in " + outputFile)
79
80 #------------------------------------------------------
81 # Extract tabular data as an XLSX file
82 print("Extract tabular data as an XLSX file")
83
84 outputFile = outputPath + "table.xlsx"
85 DataExtractionModule.ExtractToXLSX(inputPath + "table.pdf", outputFile)
86
87 print("Result saved in " + outputFile)
88
89 #------------------------------------------------------
90 # Extract tabular data as an XLSX stream (also known as filter)
91 print("Extract tabular data as an XLSX stream")
92
93 outputFile = outputPath + "financial.xlsx"
94 options = DataExtractionOptions()
95 options.SetPages("1") # page 1
96 outputXlsxStream = MemoryFilter(0, False)
97 DataExtractionModule.ExtractToXLSX(inputPath + "financial.pdf", outputXlsxStream, options)
98 outputXlsxStream.SetAsInputFilter()
99 outputXlsxStream.WriteToFile(outputFile, False)
100
101 print("Result saved in " + outputFile)
102 except Exception as e:
103 print("Unable to extract tabular data, error: " + str(e))
104
105 #-----------------------------------------------------------------------------------
106 # The following sample illustrates how to extract document structure from PDF documents.
107 #-----------------------------------------------------------------------------------
108
109 # Test if the add-on is installed
110 if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_DocStructure):
111 print("")
112 print("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
113 print("-----------------------------------------------------------------------------")
114 print("The Data Extraction suite is an optional add-on, available for download")
115 print("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
116 print("downloaded this module, ensure that the SDK is able to find the required files")
117 print("using the PDFNet.AddResourceSearchPath() function.")
118 print("")
119 else:
120 try:
121 # Extract document structure as a JSON file
122 print("Extract document structure as a JSON file")
123
124 outputFile = outputPath + "paragraphs_and_tables.json"
125 DataExtractionModule.ExtractData(inputPath + "paragraphs_and_tables.pdf", outputFile, DataExtractionModule.e_DocStructure)
126
127 print("Result saved in " + outputFile)
128
129 #------------------------------------------------------
130 # Extract document structure as a JSON string
131 print("Extract document structure as a JSON string")
132
133 outputFile = outputPath + "tagged.json"
134 json = DataExtractionModule.ExtractData(inputPath + "tagged.pdf", DataExtractionModule.e_DocStructure)
135 WriteTextToFile(outputFile, json)
136
137 print("Result saved in " + outputFile)
138 except Exception as e:
139 print("Unable to extract document structure data, error: " + str(e))
140
141 #-----------------------------------------------------------------------------------
142 # The following sample illustrates how to extract form fields from PDF documents.
143 #-----------------------------------------------------------------------------------
144
145 # Test if the add-on is installed
146 if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_Form):
147 print("")
148 print("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.")
149 print("-----------------------------------------------------------------------------")
150 print("The Data Extraction suite is an optional add-on, available for download")
151 print("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
152 print("downloaded this module, ensure that the SDK is able to find the required files")
153 print("using the PDFNet.AddResourceSearchPath() function.")
154 print("")
155 else:
156 try:
157 # Extract form fields as a JSON file
158 print("Extract form fields as a JSON file")
159
160 outputFile = outputPath + "formfields-scanned.json"
161 DataExtractionModule.ExtractData(inputPath + "formfields-scanned.pdf", outputFile, DataExtractionModule.e_Form)
162
163 print("Result saved in " + outputFile)
164
165 #------------------------------------------------------
166 # Extract form fields as a JSON string
167 print("Extract form fields as a JSON string")
168
169 outputFile = outputPath + "formfields.json"
170 json = DataExtractionModule.ExtractData(inputPath + "formfields.pdf", DataExtractionModule.e_Form)
171 WriteTextToFile(outputFile, json)
172
173 print("Result saved in " + outputFile)
174
175 #-----------------------------------------------------------------------------------
176 # Detect and add form fields to a PDF document.
177 # PDF document already has form fields, and this sample will update to new found fields.
178 print("Extract form fields as a pdf file, update to new")
179
180 doc = PDFDoc(inputPath + "formfields-scanned-withfields.pdf")
181
182 DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)
183
184 outputFile = outputPath + "formfields-scanned-fields-new.pdf"
185 doc.Save(outputFile, SDFDoc.e_linearized)
186 doc.Close()
187
188 print("Result saved in " + outputFile)
189
190 #-----------------------------------------------------------------------------------
191 # Detect and add form fields to a PDF document.
192 # PDF document already has form fields, and this sample will keep the original fields.
193 print("Extract form fields as a pdf file, keep original")
194
195 doc = PDFDoc(inputPath + "formfields-scanned-withfields.pdf")
196
197 options = DataExtractionOptions()
198 options.SetOverlappingFormFieldBehavior("KeepOld")
199 DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)
200
201 outputFile = outputPath + "formfields-scanned-fields-old.pdf"
202 doc.Save(outputFile, SDFDoc.e_linearized)
203 doc.Close()
204
205 print("Result saved in " + outputFile)
206
207 except Exception as e:
208 print("Unable to extract form fields data, error: " + str(e))
209
210 #---------------------------------------------------------------------------------------
211 # The following sample illustrates how to extract key-value pairs from PDF documents.
212 #---------------------------------------------------------------------------------------
213 if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_GenericKeyValue):
214 print()
215 print("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.")
216 print("---------------------------------------------------------------")
217 print("The Data Extraction suite is an optional add-on, available for download")
218 print("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this")
219 print("module, ensure that the SDK is able to find the required files")
220 print("using the PDFNet.AddResourceSearchPath() function.")
221 print()
222 else:
223 try:
224 print("Extract key-value pairs from a PDF")
225 # Simple example: Extract Keys & Values as a JSON file
226 DataExtractionModule.ExtractData(inputPath + "newsletter.pdf", outputPath + "newsletter_key_val.json", DataExtractionModule.e_GenericKeyValue)
227 print("Result saved in " + outputPath + "newsletter_key_val.json")
228
229 # Example with customized options:
230 # Extract Keys & Values from pages 2-4, excluding ads
231 options = DataExtractionOptions()
232 options.SetPages("2-4")
233
234 p2_exclusion_zones = RectCollection()
235 # Exclude the add-on on page 2
236 # These coordinates are in PDF user space, with the origin at the bottom left corner of the page
237 # Coordinates rotate with the page, if it has rotation applied.
238 p2_exclusion_zones.AddRect(Rect(166, 47, 562, 222))
239 options.AddExclusionZonesForPage(p2_exclusion_zones, 2)
240
241 p4_inclusion_zones = RectCollection()
242 p4_exclusion_zones = RectCollection()
243 # Only include the article text for page 4, exclude ads and headings
244 p4_inclusion_zones.AddRect(Rect(30, 432, 562, 684))
245 p4_exclusion_zones.AddRect(Rect(30, 657, 295, 684))
246 options.AddInclusionZonesForPage(p4_inclusion_zones, 4)
247 options.AddExclusionZonesForPage(p4_exclusion_zones, 4)
248 print("Extract Key-Value pairs from specific pages and zones as a JSON file")
249 DataExtractionModule.ExtractData(inputPath + "newsletter.pdf", outputPath + "newsletter_key_val_with_zones.json", DataExtractionModule.e_GenericKeyValue, options)
250 print("Result saved in " + outputPath + "newsletter_key_val_with_zones.json")
251 except Exception as e:
252 print("Unable to extract key-value data, error: " + str(e))
253
254
255 #-----------------------------------------------------------------------------------
256 # The following sample illustrates how to extract document classes from PDF documents.
257 #-----------------------------------------------------------------------------------
258
259 # Test if the add-on is installed
260 if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_DocClassification):
261 print("")
262 print("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.")
263 print("-----------------------------------------------------------------------------")
264 print("The Data Extraction suite is an optional add-on, available for download")
265 print("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
266 print("downloaded this module, ensure that the SDK is able to find the required files")
267 print("using the PDFNet.AddResourceSearchPath() function.")
268 print("")
269 else:
270 try:
271 # Simple example: classify pages as a JSON file
272 print("Classify pages as a JSON file")
273
274 outputFile = outputPath + "Invoice_Classified.json"
275 DataExtractionModule.ExtractData(inputPath + "Invoice.pdf", outputFile, DataExtractionModule.e_DocClassification)
276
277 print("Result saved in " + outputFile)
278
279 #------------------------------------------------------
280 # Classify pages as a JSON string
281 print("Classify pages as a JSON string")
282
283 outputFile = outputPath + "Scientific_Publication_Classified.json"
284 json = DataExtractionModule.ExtractData(inputPath + "Scientific_Publication.pdf", DataExtractionModule.e_DocClassification)
285 WriteTextToFile(outputFile, json)
286
287 print("Result saved in " + outputFile)
288
289 #------------------------------------------------------
290 # Example with customized options:
291 print("Classify pages with customized options")
292
293 options = DataExtractionOptions()
294 # Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
295 options.SetMinimumConfidenceThreshold(0.7)
296 outputFile = outputPath + "Email_Classified.json"
297 DataExtractionModule.ExtractData(inputPath + "Email.pdf", outputFile, DataExtractionModule.e_DocClassification, options)
298
299 print("Result saved in " + outputFile)
300
301 except Exception as e:
302 print("Unable to extract document structure data, error: " + str(e))
303
304 PDFNet.Terminate()
305 print("Done.")
306
307if __name__ == '__main__':
308 main()
309
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
12#---------------------------------------------------------------------------------------
13# The Data Extraction suite is an optional PDFNet add-on collection that can be used to
14# extract various types of data from PDF documents.
15#
16# The Apryse SDK Data Extraction suite can be downloaded from
17# https://docs.apryse.com/core/guides/info/modules#data-extraction-module
18#
19# Please contact us if you have any questions.
20#---------------------------------------------------------------------------------------
21
22# Relative path to the folder containing the test files.
23$inputPath = "../../TestFiles/"
24$outputPath = "../../TestFiles/Output/"
25
# Prints the notice shown when an optional Data Extraction add-on module
# cannot be found by the SDK.
#
# module_name - String naming the missing module (e.g. "Tabular Data").
def report_missing_module(module_name)
  puts ""
  puts "Unable to run Data Extraction: Apryse SDK " + module_name + " module not available."
  puts "-----------------------------------------------------------------------------"
  puts "The Data Extraction suite is an optional add-on, available for download"
  puts "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already"
  puts "downloaded this module, ensure that the SDK is able to find the required files"
  puts "using the PDFNet.AddResourceSearchPath() function."
  puts ""
end

# Runs every Data Extraction sample in turn: tables, document structure,
# form fields, generic key-value pairs and document classification.
# Each sample is skipped with a notice when its engine is not available, and
# each sample reports its own errors so that later samples still run.
def main()
  # The first step in every application using PDFNet is to initialize the
  # library. The library is usually initialized only once, but calling
  # Initialize() multiple times is also fine.
  PDFNet.Initialize(PDFTronLicense.Key)

  PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")

  #-----------------------------------------------------------------------------------
  # The following sample illustrates how to extract tables from PDF documents.
  #-----------------------------------------------------------------------------------

  # Test if the add-on is installed
  if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_Tabular) then
    report_missing_module("Tabular Data")
  else
    begin
      # Extract tabular data as a JSON file
      puts "Extract tabular data as a JSON file"

      outputFile = $outputPath + "table.json"
      DataExtractionModule.ExtractData($inputPath + "table.pdf", outputFile, DataExtractionModule::E_Tabular)

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract tabular data as a JSON string
      puts "Extract tabular data as a JSON string"

      outputFile = $outputPath + "financial.json"
      json = DataExtractionModule.ExtractData($inputPath + "financial.pdf", DataExtractionModule::E_Tabular)
      File.open(outputFile, 'w') { |file| file.write(json) }

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract tabular data as an XLSX file
      puts "Extract tabular data as an XLSX file"

      outputFile = $outputPath + "table.xlsx"
      DataExtractionModule.ExtractToXLSX($inputPath + "table.pdf", outputFile)

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract tabular data as an XLSX stream (also known as filter)
      puts "Extract tabular data as an XLSX stream"

      outputFile = $outputPath + "financial.xlsx"
      outputXlsxStream = MemoryFilter.new(0, false)
      options = DataExtractionOptions.new()
      options.SetPages("1") # page 1
      DataExtractionModule.ExtractToXLSX($inputPath + "financial.pdf", outputXlsxStream, options)
      outputXlsxStream.SetAsInputFilter()
      outputXlsxStream.WriteToFile(outputFile, false)

      puts "Result saved in " + outputFile
    rescue => error
      puts "Unable to extract tabular data, error: " + error.message
    end
  end

  #-----------------------------------------------------------------------------------
  # The following sample illustrates how to extract document structure from PDF documents.
  #-----------------------------------------------------------------------------------

  # Test if the add-on is installed
  if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_DocStructure) then
    report_missing_module("Structured Output")
  else
    begin
      # Extract document structure as a JSON file
      puts "Extract document structure as a JSON file"

      outputFile = $outputPath + "paragraphs_and_tables.json"
      DataExtractionModule.ExtractData($inputPath + "paragraphs_and_tables.pdf", outputFile, DataExtractionModule::E_DocStructure)

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract document structure as a JSON string
      puts "Extract document structure as a JSON string"

      outputFile = $outputPath + "tagged.json"
      json = DataExtractionModule.ExtractData($inputPath + "tagged.pdf", DataExtractionModule::E_DocStructure)
      File.open(outputFile, 'w') { |file| file.write(json) }

      puts "Result saved in " + outputFile
    rescue => error
      puts "Unable to extract document structure data, error: " + error.message
    end
  end

  #-----------------------------------------------------------------------------------
  # The following sample illustrates how to extract form fields from PDF documents.
  #-----------------------------------------------------------------------------------

  # Test if the add-on is installed
  if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_Form) then
    report_missing_module("AIFormFieldExtractor")
  else
    begin
      # Extract form fields as a JSON file
      puts "Extract form fields as a JSON file"

      outputFile = $outputPath + "formfields-scanned.json"
      DataExtractionModule.ExtractData($inputPath + "formfields-scanned.pdf", outputFile, DataExtractionModule::E_Form)

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract form fields as a JSON string
      puts "Extract form fields as a JSON string"

      outputFile = $outputPath + "formfields.json"
      json = DataExtractionModule.ExtractData($inputPath + "formfields.pdf", DataExtractionModule::E_Form)
      File.open(outputFile, 'w') { |file| file.write(json) }

      puts "Result saved in " + outputFile

      #-----------------------------------------------------------------------------------
      # Detect and add form fields to a PDF document.
      # PDF document already has form fields, and this sample will update to the new fields.
      puts "Extract form fields as a pdf file, update to new"

      outputFile = $outputPath + "formfields-scanned-fields-new.pdf"
      doc = PDFDoc.new($inputPath + "formfields-scanned-withfields.pdf")
      begin
        DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)
        doc.Save(outputFile, SDFDoc::E_linearized)
      ensure
        # Close the document even when detection or saving raises.
        doc.Close
      end

      puts "Result saved in " + outputFile

      #-----------------------------------------------------------------------------------
      # Detect and add form fields to a PDF document.
      # PDF document already has form fields, and this sample will keep the original fields.
      puts "Extract form fields as a pdf file, keep original"

      outputFile = $outputPath + "formfields-scanned-fields-old.pdf"
      doc = PDFDoc.new($inputPath + "formfields-scanned-withfields.pdf")
      begin
        options = DataExtractionOptions.new()
        options.SetOverlappingFormFieldBehavior("KeepOld")
        DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)
        doc.Save(outputFile, SDFDoc::E_linearized)
      ensure
        # Close the document even when detection or saving raises.
        doc.Close
      end

      puts "Result saved in " + outputFile
    rescue => error
      puts "Unable to extract form fields data, error: " + error.message
    end
  end

  #-----------------------------------------------------------------------------------
  # The following sample illustrates how to extract key-value pairs from PDF documents.
  #-----------------------------------------------------------------------------------

  # Test if the add-on is installed
  if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_GenericKeyValue) then
    report_missing_module("AIPageObjectExtractor")
  else
    begin
      puts "Extract key-value pairs from a PDF"
      # Simple example: Extract Keys & Values as a JSON file
      outputFile = $outputPath + "newsletter_key_val.json"
      DataExtractionModule.ExtractData($inputPath + "newsletter.pdf", outputFile, DataExtractionModule::E_GenericKeyValue)
      puts "Result saved in " + outputFile

      # Example with customized options:
      # Extract Keys & Values from pages 2-4, excluding ads
      options = DataExtractionOptions.new()
      options.SetPages("2-4")

      p2_exclusion_zones = RectCollection.new()
      # Exclude the ad on page 2
      # These coordinates are in PDF user space, with the origin at the bottom left corner of the page
      # Coordinates rotate with the page, if it has rotation applied.
      p2_exclusion_zones.AddRect(Rect.new(166, 47, 562, 222))
      options.AddExclusionZonesForPage(p2_exclusion_zones, 2)

      p4_inclusion_zones = RectCollection.new()
      p4_exclusion_zones = RectCollection.new()
      # Only include the article text for page 4, exclude ads and headings
      p4_inclusion_zones.AddRect(Rect.new(30, 432, 562, 684))
      p4_exclusion_zones.AddRect(Rect.new(30, 657, 295, 684))
      options.AddInclusionZonesForPage(p4_inclusion_zones, 4)
      options.AddExclusionZonesForPage(p4_exclusion_zones, 4)

      puts "Extract Key-Value pairs from specific pages and zones as a JSON file"
      outputFile = $outputPath + "newsletter_key_val_with_zones.json"
      DataExtractionModule.ExtractData($inputPath + "newsletter.pdf", outputFile, DataExtractionModule::E_GenericKeyValue, options)
      puts "Result saved in " + outputFile
    rescue => error
      puts "Unable to extract key-value data, error: " + error.message
    end
  end

  #-----------------------------------------------------------------------------------
  # The following sample illustrates how to extract document classes from PDF documents.
  #-----------------------------------------------------------------------------------

  # Test if the add-on is installed
  if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_DocClassification) then
    report_missing_module("AIPageObjectExtractor")
  else
    begin
      # Simple example: classify pages as a JSON file
      puts "Classify pages as a JSON file"

      outputFile = $outputPath + "Invoice_Classified.json"
      DataExtractionModule.ExtractData($inputPath + "Invoice.pdf", outputFile, DataExtractionModule::E_DocClassification)

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Classify pages as a JSON string
      puts "Classify pages as a JSON string"

      outputFile = $outputPath + "Scientific_Publication_Classified.json"
      json = DataExtractionModule.ExtractData($inputPath + "Scientific_Publication.pdf", DataExtractionModule::E_DocClassification)
      File.open(outputFile, 'w') { |file| file.write(json) }

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Example with customized options:
      puts "Classify pages with customized options"

      options = DataExtractionOptions.new()
      # Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
      options.SetMinimumConfidenceThreshold(0.7)
      outputFile = $outputPath + "Email_Classified.json"
      DataExtractionModule.ExtractData($inputPath + "Email.pdf", outputFile, DataExtractionModule::E_DocClassification, options)

      puts "Result saved in " + outputFile
    rescue => error
      puts "Unable to extract document classification data, error: " + error.message
    end
  end

  #-----------------------------------------------------------------------------------

  PDFNet.Terminate
  puts "Done."
end
303
304main()
305
1'
2' Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3'
4
5Imports pdftron
6Imports pdftron.Common
7Imports pdftron.PDF
8Imports pdftron.Filters
9
10' The Data Extraction suite is an optional PDFNet add-on collection that can be used to
11' extract various types of data from PDF documents.
12' The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
13
Module DataExtractionTestVB
    Dim pdfNetLoader As PDFNetLoader

    ' Module constructor: obtains the PDFNetLoader instance.
    Sub New()
        pdfNetLoader = pdftron.PDFNetLoader.Instance()
    End Sub

    ' Relative path to the folder containing test files.
    Dim input_path As String = "../../../../TestFiles/"
    Dim output_path As String = "../../../../TestFiles/Output/"

    ' Entry point: initializes PDFNet, runs every Data Extraction sample in
    ' turn, then terminates PDFNet.
    Sub Main()
        PDFNet.Initialize(PDFTronLicense.Key)
        PDFNet.AddResourceSearchPath("../../../../../Lib/")

        TestTabularData()
        TestDocumentStructure()
        TestFormFields()
        TestGenericKeyValue()
        TestDocClassifier()

        PDFNet.Terminate()
    End Sub


    ' Prints the notice shown when an optional Data Extraction add-on module
    ' cannot be found by the SDK. moduleName is the name of the missing
    ' module as it should appear in the message (e.g. "Tabular Data").
    Private Sub PrintModuleUnavailable(moduleName As String)
        Console.WriteLine()
        Console.WriteLine("Unable to run Data Extraction: Apryse SDK " & moduleName & " module not available.")
        Console.WriteLine("---------------------------------------------------------------")
        Console.WriteLine("The Data Extraction suite is an optional add-on, available for download")
        Console.WriteLine("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this")
        Console.WriteLine("module, ensure that the SDK is able to find the required files")
        Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.")
        Console.WriteLine()
    End Sub


    ' The following sample illustrates how to extract tables from PDF documents.
    Sub TestTabularData()
        ' Test if the add-on is installed
        If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular) Then
            PrintModuleUnavailable("Tabular Data")
            Return
        End If

        Try
            ' Extract tabular data as a JSON file
            DataExtractionModule.ExtractData(input_path & "table.pdf", output_path & "table.json", DataExtractionModule.DataExtractionEngine.e_tabular)

            ' Extract tabular data as a JSON string
            Dim json As String = DataExtractionModule.ExtractData(input_path & "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular)
            System.IO.File.WriteAllText(output_path & "financial.json", json)

            ' Extract tabular data as an XLSX file
            DataExtractionModule.ExtractToXLSX(input_path & "table.pdf", output_path & "table.xlsx")

            ' Extract tabular data as an XLSX stream (also known as filter)
            Dim output_xlsx_stream As MemoryFilter = New MemoryFilter(0, False)
            DataExtractionModule.ExtractToXLSX(input_path & "financial.pdf", output_xlsx_stream)
            output_xlsx_stream.SetAsInputFilter()
            output_xlsx_stream.WriteToFile(output_path & "financial.xlsx", False)

        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try
    End Sub


    ' The following sample illustrates how to extract document structure from PDF documents.
    Sub TestDocumentStructure()
        ' Test if the add-on is installed
        If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure) Then
            PrintModuleUnavailable("Structured Output")
            Return
        End If

        Try
            ' Extract document structure as a JSON file
            DataExtractionModule.ExtractData(input_path & "paragraphs_and_tables.pdf", output_path & "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure)

            ' Extract document structure as a JSON string
            Dim json As String = DataExtractionModule.ExtractData(input_path & "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure)
            System.IO.File.WriteAllText(output_path & "tagged.json", json)

        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try
    End Sub


    ' The following sample illustrates how to extract form fields from PDF documents.
    Sub TestFormFields()
        ' Test if the add-on is installed
        If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form) Then
            PrintModuleUnavailable("AIFormFieldExtractor")
            Return
        End If

        Try
            ' Extract form fields as a JSON file
            DataExtractionModule.ExtractData(input_path & "formfields-scanned.pdf", output_path & "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form)

            ' Extract form fields as a JSON string
            Dim json As String = DataExtractionModule.ExtractData(input_path & "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form)
            System.IO.File.WriteAllText(output_path & "formfields.json", json)

            ' Detect and add form fields to a PDF document.
            ' PDF document already has form fields, and this sample will update to new found fields.
            Using doc = New PDFDoc(input_path & "formfields-scanned-withfields.pdf")
                DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)
                doc.Save(output_path & "formfields-scanned-fields-new.pdf", SDF.SDFDoc.SaveOptions.e_linearized)
            End Using

            ' Detect and add form fields to a PDF document.
            ' PDF document already has form fields, and this sample will keep the original fields.
            Using doc = New PDFDoc(input_path & "formfields-scanned-withfields.pdf")
                Dim options = New DataExtractionOptions()
                options.SetOverlappingFormFieldBehavior("KeepOld")
                DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)
                doc.Save(output_path & "formfields-scanned-fields-old.pdf", SDF.SDFDoc.SaveOptions.e_linearized)
            End Using

        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try

    End Sub

    ' The following sample illustrates how to extract key-value pairs from PDF documents.
    Sub TestGenericKeyValue()
        ' Test if the add-on is installed
        If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_generic_key_value) Then
            PrintModuleUnavailable("AIPageObjectExtractor")
            Return
        End If

        ' Errors are reported the same way as in the other samples so that a
        ' failure here does not prevent the remaining samples from running.
        Try
            ' Simple example: Extract Keys & Values as a JSON file
            DataExtractionModule.ExtractData(input_path & "newsletter.pdf", output_path & "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value)

            ' Example with customized options:
            ' Extract Keys & Values from pages 2-4, excluding ads
            Dim options As New DataExtractionOptions()
            options.SetPages("2-4")

            Dim p2ExclusionZones As New RectCollection()
            ' Exclude the ad on page 2
            ' These coordinates are in PDF user space, with the origin at the bottom left corner of the page
            ' Coordinates rotate with the page, if it has rotation applied.
            p2ExclusionZones.AddRect(166, 47, 562, 222)
            options.AddExclusionZonesForPage(p2ExclusionZones, 2)

            Dim p4InclusionZones As New RectCollection()
            Dim p4ExclusionZones As New RectCollection()
            ' Only include the article text for page 4, exclude ads and headings
            p4InclusionZones.AddRect(30, 432, 562, 684)
            p4ExclusionZones.AddRect(30, 657, 295, 684)
            options.AddInclusionZonesForPage(p4InclusionZones, 4)
            options.AddExclusionZonesForPage(p4ExclusionZones, 4)

            DataExtractionModule.ExtractData(input_path & "newsletter.pdf", output_path & "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options)

        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try
    End Sub

    ' The following sample illustrates how to extract document classes from PDF documents.
    Sub TestDocClassifier()
        ' Test if the add-on is installed
        If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_classification) Then
            PrintModuleUnavailable("AIPageObjectExtractor")
            Return
        End If

        Try
            ' Simple example: classify pages as a JSON file
            DataExtractionModule.ExtractData(input_path & "Invoice.pdf", output_path & "Invoice_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification)

            ' Classify pages as a JSON string
            Dim json As String = DataExtractionModule.ExtractData(input_path & "Scientific_Publication.pdf", DataExtractionModule.DataExtractionEngine.e_doc_classification)
            System.IO.File.WriteAllText(output_path & "Scientific_Publication_Classified.json", json)

            ' Example with customized options:
            Dim options As New DataExtractionOptions()
            ' Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
            options.SetMinimumConfidenceThreshold(0.7)
            DataExtractionModule.ExtractData(input_path & "Email.pdf", output_path & "Email_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification, options)

        Catch e As PDFNetException
            Console.WriteLine(e.Message)
        End Try
    End Sub

End Module
224
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales