Smart Data Extraction - Python Sample Code

Requirements

This sample code shows how to use the Apryse Data Extraction module to extract tabular data, document structure, form fields, key-value pairs and document classes from PDF documents. Sample code is provided in Python, C++, C# (.Net), Go, Java, Node.js (JavaScript), PHP, Ruby and VB.

Looking for data extraction + WebViewer UI? Check out our Document Structure Extraction - Showcase Sample Code

Learn more about our Server SDK and Smart Data Extraction.

Implementation steps

Get started with Server SDK in your language/framework
Download the Data Extraction Module
Add the sample code provided in this guide

To use this feature in production, your license key will need the Smart Data Extraction Package. Trial keys already include all packages.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6using System;
7
8using pdftron;
9using pdftron.Common;
10using pdftron.PDF;
11using pdftron.SDF;
12using pdftron.Filters;
13
14namespace DataExtractionTestCS
15{
16	/// <summary>
17	///---------------------------------------------------------------------------------------
18	/// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
19	/// extract various types of data from PDF documents.
20	///
21	/// The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
22	///---------------------------------------------------------------------------------------
23	/// </summary>
24	class Class1
25	{
26		private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
27		static Class1() { }
28
29		// Relative path to the folder containing test files.
30		static string input_path = "../../../../TestFiles/";
31		static string output_path = "../../../../TestFiles/Output/";
32
33
34		/// <summary>
35		/// Extracts tables from table.pdf and financial.pdf, demonstrating JSON (file and string) and XLSX (file and stream) output.
36		/// </summary>
37		static void TestTabularData()
38		{
39			// Skip this sample, after printing download instructions, when the tabular engine is not installed
40			if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular))
41			{
42				Console.WriteLine();
43				Console.WriteLine("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.");
44				Console.WriteLine("---------------------------------------------------------------");
45				Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
46				Console.WriteLine("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module .  If you have already downloaded this");
47				Console.WriteLine("module, ensure that the SDK is able to find the required files");
48				Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
49				Console.WriteLine();
50				return;
51			}
52
53			try
54			{
55				// Extract tabular data as a JSON file (the SDK writes table.json itself)
56				DataExtractionModule.ExtractData(input_path + "table.pdf", output_path + "table.json", DataExtractionModule.DataExtractionEngine.e_tabular);
57
58				// Extract tabular data as a JSON string, then save that string ourselves
59				string json = DataExtractionModule.ExtractData(input_path + "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular);
60				System.IO.File.WriteAllText(output_path + "financial.json", json);
61
62				// Extract tabular data as an XLSX file
63				DataExtractionModule.ExtractToXLSX(input_path + "table.pdf", output_path + "table.xlsx");
64
65				// Extract tabular data as an XLSX stream (also known as filter), then persist the stream to financial.xlsx
66				MemoryFilter output_xlsx_stream = new MemoryFilter(0, false);
67				DataExtractionModule.ExtractToXLSX(input_path + "financial.pdf", output_xlsx_stream);
68				output_xlsx_stream.SetAsInputFilter(); // NOTE(review): presumably switches the filter from writing to reading before it is flushed to disk - confirm
69				output_xlsx_stream.WriteToFile(output_path + "financial.xlsx", false);
70			}
71			catch (PDFNetException e) // only SDK errors are handled here: print the message and return so Main can run the next sample
72			{
73				Console.WriteLine(e.Message);
74			}
75		}
76
77
78		/// <summary>
79		/// Extracts document structure from paragraphs_and_tables.pdf and tagged.pdf as JSON (file and string variants).
80		/// </summary>
81		static void TestDocumentStructure()
82		{
83			// Skip this sample, after printing download instructions, when the document-structure engine is not installed
84			if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure))
85			{
86				Console.WriteLine();
87				Console.WriteLine("Unable to run Data Extraction: Apryse SDK Structured Output module not available.");
88				Console.WriteLine("---------------------------------------------------------------");
89				Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
90				Console.WriteLine("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
91				Console.WriteLine("module, ensure that the SDK is able to find the required files");
92				Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
93				Console.WriteLine();
94				return;
95			}
96
97			try
98			{
99				// Extract document structure as a JSON file (the SDK writes the output file itself)
100				DataExtractionModule.ExtractData(input_path + "paragraphs_and_tables.pdf", output_path + "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure);
101
102				// Extract document structure as a JSON string, then save that string ourselves
103				string json = DataExtractionModule.ExtractData(input_path + "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure);
104				System.IO.File.WriteAllText(output_path + "tagged.json", json);
105			}
106			catch (PDFNetException e) // only SDK errors are handled here: print the message and return to Main
107			{
108				Console.WriteLine(e.Message);
109			}
110		}
111
112
113		/// <summary>
114		/// Extracts form fields as JSON, then detects and adds form fields to a PDF twice: once with default options and once with the "KeepOld" overlap behavior.
115		/// </summary>
116		static void TestFormFields()
117		{
118			// Skip this sample, after printing download instructions, when the form engine is not installed
119			if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
120			{
121				Console.WriteLine();
122				Console.WriteLine("Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.");
123				Console.WriteLine("---------------------------------------------------------------");
124				Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
125				Console.WriteLine("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
126				Console.WriteLine("module, ensure that the SDK is able to find the required files");
127				Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
128				Console.WriteLine();
129				return;
130			}
131
132			try
133			{
134				// Extract form fields as a JSON file (the SDK writes the output file itself)
135				DataExtractionModule.ExtractData(input_path + "formfields-scanned.pdf", output_path + "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form);
136
137				// Extract form fields as a JSON string, then save that string ourselves
138				string json = DataExtractionModule.ExtractData(input_path + "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form);
139				System.IO.File.WriteAllText(output_path + "formfields.json", json);
140
141				// Detect and add form fields to a PDF document.
142				// The input PDF already has form fields; this run uses the default options, which update them to the newly found fields.
143				using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
144				{
145					DataExtractionModule.DetectAndAddFormFieldsToPDF(doc);
146					doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDFDoc.SaveOptions.e_linearized);
147				}
148
149				// Detect and add form fields to a PDF document.
150				// The input PDF already has form fields; the "KeepOld" option makes this run keep the original fields.
151				using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
152				{
153					DataExtractionOptions options = new DataExtractionOptions();
154					options.SetOverlappingFormFieldBehavior("KeepOld");
155
156					DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options);
157					doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDFDoc.SaveOptions.e_linearized);
158				}
159			}
160			catch (PDFNetException e) // only SDK errors are handled here: print the message and return to Main
161			{
162				Console.WriteLine(e.Message);
163			}
164		}
165
166		/// <summary>
167		/// Extracts key-value pairs from newsletter.pdf as JSON: once with default options, once restricted to pages 2-4 with inclusion/exclusion zones.
168		/// </summary>
169		static void TestGenericKeyValue()
170		{
171			if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_generic_key_value)) // Test if the add-on is installed
172				{
173					Console.WriteLine();
174					Console.WriteLine("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.");
175					Console.WriteLine("---------------------------------------------------------------");
176					Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
177					Console.WriteLine("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
178					Console.WriteLine("module, ensure that the SDK is able to find the required files");
179					Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
180					Console.WriteLine();
181					return;
182				}
183
184			try 
185			{				
186				// Simple example: Extract Keys & Values as a JSON file
187				DataExtractionModule.ExtractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value);
188
189				// Example with customized options:
190				// Extract Keys & Values from pages 2-4, excluding ads
191				DataExtractionOptions options = new DataExtractionOptions();
192				options.SetPages("2-4");
193
194				RectCollection p2ExclusionZones = new RectCollection();
195				// Exclude the ad on page 2
196				// These coordinates are in PDF user space, with the origin at the bottom left corner of the page
197				// Coordinates rotate with the page, if it has rotation applied.
198				p2ExclusionZones.AddRect(166, 47, 562, 222);
199				options.AddExclusionZonesForPage(p2ExclusionZones, 2);
200
201				RectCollection p4InclusionZones = new RectCollection();
202				RectCollection p4ExclusionZones = new RectCollection();
203				// Only include the article text for page 4, exclude ads and headings
204				p4InclusionZones.AddRect(30, 432, 562, 684);
205				p4ExclusionZones.AddRect(30, 657, 295, 684); // this rectangle lies inside the inclusion zone above
206				options.AddInclusionZonesForPage(p4InclusionZones, 4);
207				options.AddExclusionZonesForPage(p4ExclusionZones, 4);
208
209				DataExtractionModule.ExtractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options);
210			}
211			catch (PDFNetException e) // only SDK errors are handled here: print the message and return to Main
212			{
213				Console.WriteLine(e.Message);
214			}
215		}
216
217
218
219		/// <summary>
220		/// Classifies the pages of Invoice.pdf, Scientific_Publication.pdf and Email.pdf, writing the detected document classes as JSON (file, string, and file with a confidence threshold).
221		/// </summary>
222		static void TestDocClassifier()
223		{
224			// Skip this sample, after printing download instructions, when the classification engine is not installed
225			if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_classification))
226			{
227				Console.WriteLine();
228				Console.WriteLine("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.");
229				Console.WriteLine("---------------------------------------------------------------");
230				Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
231				Console.WriteLine("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
232				Console.WriteLine("module, ensure that the SDK is able to find the required files");
233				Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
234				Console.WriteLine();
235				return;
236			}
237
238			try
239			{
240				// Simple example: classify pages as a JSON file
241				DataExtractionModule.ExtractData(input_path + "Invoice.pdf", output_path + "Invoice_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification);
242
243				// Classify pages as a JSON string, then save that string ourselves
244				string json = DataExtractionModule.ExtractData(input_path + "Scientific_Publication.pdf", DataExtractionModule.DataExtractionEngine.e_doc_classification);
245				System.IO.File.WriteAllText(output_path + "Scientific_Publication_Classified.json", json);
246
247				// Example with customized options:
248				DataExtractionOptions options = new DataExtractionOptions();
249				// Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
250				options.SetMinimumConfidenceThreshold(0.7);
251				DataExtractionModule.ExtractData(input_path + "Email.pdf", output_path + "Email_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification, options);
252			}
253			catch (PDFNetException e) // only SDK errors are handled here: print the message and return to Main
254			{
255				Console.WriteLine(e.Message);
256			}
257		}
258
259
260		/// <summary>
261		/// The main entry point for the application: initializes PDFNet, runs the five data extraction samples in order, then terminates PDFNet.
262		/// </summary>
263		static void Main(string[] args)
264		{
265			// The first step in every application using PDFNet is to initialize the 
266			// library and set the path to common PDF resources. The library is usually 
267			// initialized only once, but calling Initialize() multiple times is also fine.
268			PDFNet.Initialize(PDFTronLicense.Key);
269			PDFNet.AddResourceSearchPath("../../../../../Lib/"); // folder the SDK searches for the optional Data Extraction module files
270
271			TestTabularData();
272			TestDocumentStructure();
273			TestFormFields();
274			TestGenericKeyValue();
275			TestDocClassifier();
276
277			PDFNet.Terminate(); // NOTE(review): not reached if a sample throws something other than PDFNetException - there is no try/finally here
278		}
279	}
280}
281

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/DataExtractionModule.h>
7#include <PDF/PDFNet.h>
8#include <PDF/PDFDoc.h>
9#include <PDF/Convert.h>
10#include <Filters/MemoryFilter.h>
11#include <string>
12#include <iostream>
13#include <fstream>
14#include "../../LicenseKey/CPP/LicenseKey.h"
15
16using namespace pdftron;
17using namespace PDF;
18using namespace Filters;
19using namespace std;
20
21//---------------------------------------------------------------------------------------
22// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
23// extract various types of data from PDF documents.
24//
25// The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
26//---------------------------------------------------------------------------------------
27
28void WriteTextToFile(const std::string& filename, const UString& text) // writes text to filename, encoded as UTF-8
29{
30	ofstream out_file(filename.c_str(), ofstream::binary); // binary mode; NOTE(review): a failed open is not checked, so errors are silent
31	string out_buf = text.ConvertToUtf8();
32	out_file.write(out_buf.c_str(), out_buf.size()); // size() is the byte count of the UTF-8 buffer
33	out_file.close();
34}
35
36
37string input_path("../../TestFiles/");
38string output_path("../../TestFiles/Output/");
39
40//---------------------------------------------------------------------------------------
41// Extracts tables from table.pdf and financial.pdf as JSON (file and string) and XLSX (file and stream, page 1 only).
42//---------------------------------------------------------------------------------------
43void TestTabularData()
44{
45	// Skip this sample, after printing download instructions, when the tabular engine is not installed
46	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular))
47	{
48		cout << endl;
49		cout << "Unable to run Data Extraction: Apryse SDK Tabular Data module not available." << endl;
50		cout << "---------------------------------------------------------------" << endl;
51		cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
52		cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
53		cout << "module, ensure that the SDK is able to find the required files" << endl;
54		cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
55		return;
56	}
57
58	// Extract tabular data as a JSON file (no try/catch here: exceptions propagate to the handlers in main)
59	DataExtractionModule::ExtractData(input_path + UString("table.pdf"), output_path + UString("table.json"), DataExtractionModule::e_Tabular);
60
61	// Extract tabular data as a JSON string, then save that string with WriteTextToFile
62	UString json = DataExtractionModule::ExtractData(input_path + UString("financial.pdf"), DataExtractionModule::e_Tabular);
63	WriteTextToFile((output_path + "financial.json").c_str(), json);
64
65	// Extract tabular data as an XLSX file
66	DataExtractionModule::ExtractToXLSX(input_path + UString("table.pdf"), output_path + UString("table.xlsx"));
67
68	// Extract tabular data as an XLSX stream (also known as filter), then persist the stream to financial.xlsx
69	MemoryFilter output_xlsx_stream(0, false);
70	DataExtractionOptions options;
71	options.SetPages("1"); // restrict this extraction to page 1
72	DataExtractionModule::ExtractToXLSX(input_path + UString("financial.pdf"), output_xlsx_stream, &options);
73	output_xlsx_stream.SetAsInputFilter(); // NOTE(review): presumably switches the filter from writing to reading before it is flushed to disk - confirm
74	output_xlsx_stream.WriteToFile(output_path + UString("financial.xlsx"), false);
75}
76
77//---------------------------------------------------------------------------------------
78// Extracts document structure from paragraphs_and_tables.pdf and tagged.pdf as JSON (file and string variants).
79//---------------------------------------------------------------------------------------
80void TestDocumentStructure()
81{
82	// Skip this sample, after printing download instructions, when the document-structure engine is not installed
83	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure))
84	{
85		cout << endl;
86		cout << "Unable to run Data Extraction: Apryse SDK Structured Output module not available." << endl;
87		cout << "---------------------------------------------------------------" << endl;
88		cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
89		cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
90		cout << "module, ensure that the SDK is able to find the required files" << endl;
91		cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
92		return;
93	}
94
95	// Extract document structure as a JSON file (no try/catch here: exceptions propagate to the handlers in main)
96	DataExtractionModule::ExtractData(input_path + UString("paragraphs_and_tables.pdf"), output_path + UString("paragraphs_and_tables.json"), DataExtractionModule::e_DocStructure);
97
98	// Extract document structure as a JSON string, then save that string with WriteTextToFile
99	UString json = DataExtractionModule::ExtractData(input_path + UString("tagged.pdf"), DataExtractionModule::e_DocStructure);
100	WriteTextToFile((output_path + "tagged.json").c_str(), json);
101}
102
103//---------------------------------------------------------------------------------------
104// Extracts form fields as JSON, then detects and adds form fields to a PDF twice: with default options and with "KeepOld".
105//---------------------------------------------------------------------------------------
106void TestFormFields()
107{
108	// Skip this sample, after printing download instructions, when the form engine is not installed
109	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form))
110	{
111		cout << endl;
112		cout << "Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available." << endl;
113		cout << "---------------------------------------------------------------" << endl;
114		cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
115		cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
116		cout << "module, ensure that the SDK is able to find the required files" << endl;
117		cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
118		return;
119	}
120
121	// Extract form fields as a JSON file (no try/catch here: exceptions propagate to the handlers in main)
122	DataExtractionModule::ExtractData(input_path + UString("formfields-scanned.pdf"), output_path + UString("formfields-scanned.json"), DataExtractionModule::e_Form);
123
124	// Extract form fields as a JSON string, then save that string with WriteTextToFile
125	UString json = DataExtractionModule::ExtractData(input_path + UString("formfields.pdf"), DataExtractionModule::e_Form);
126	WriteTextToFile((output_path + "formfields.json").c_str(), json);
127
128	//---------------------------------------------------------------------------------------
129	// Detect and add form fields to a PDF document.
130	// The input PDF already has form fields; this run uses the default options, which update them to the newly found fields.
131	//---------------------------------------------------------------------------------------
132	{
133		PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
134
135		DataExtractionModule::DetectAndAddFormFieldsToPDF(doc);
136
137		// Save the modified PDF document
138		doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDF::SDFDoc::e_linearized, NULL);
139	}
140
141	//---------------------------------------------------------------------------------------
142	// Detect and add form fields to a PDF document.
143	// The input PDF already has form fields; the "KeepOld" option makes this run keep the original fields.
144	//---------------------------------------------------------------------------------------
145	{
146		PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
147
148		// Setup DataExtractionOptions to keep old fields
149		DataExtractionOptions options;
150		options.SetOverlappingFormFieldBehavior("KeepOld");
151
152		DataExtractionModule::DetectAndAddFormFieldsToPDF(doc, &options);
153
154		// Save the modified PDF document
155		doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDF::SDFDoc::e_linearized, NULL);
156	}
157}
158
159//---------------------------------------------------------------------------------------
160// Extracts key-value pairs from newsletter.pdf as JSON: once with defaults, once limited to pages 2-4 with inclusion/exclusion zones.
161//---------------------------------------------------------------------------------------
162void TestGenericKeyValue() {
163
164	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_GenericKeyValue)) // Test if the add-on is installed
165	{
166		cout << endl;
167		cout << "Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available." << endl;
168		cout << "---------------------------------------------------------------" << endl;
169		cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
170		cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
171		cout << "module, ensure that the SDK is able to find the required files" << endl;
172		cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
173		return;
174	}
175
176	// Simple example: Extract Keys & Values as a JSON file
177	DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val.json"), DataExtractionModule::e_GenericKeyValue);
178
179	// Example with customized options:
180	// Extract Keys & Values from pages 2-4, excluding ads
181	DataExtractionOptions options;
182	options.SetPages("2-4");
183	RectCollection p2_exclusion_zones;
184	// Exclude the ad on page 2
185	// These coordinates are in PDF user space, with the origin at the bottom left corner of the page
186	// Coordinates rotate with the page, if it has rotation applied.
187	p2_exclusion_zones.AddRect(166, 47, 562, 222);
188	options.AddExclusionZonesForPage(p2_exclusion_zones, 2);
189
190	RectCollection p4_inclusion_zones, p4_exclusion_zones;
191	// Only include the article text for page 4, exclude ads and headings
192	p4_inclusion_zones.AddRect(30, 432, 562, 684);
193	p4_exclusion_zones.AddRect(30, 657, 295, 684); // this rectangle lies inside the inclusion zone above
194	options.AddInclusionZonesForPage(p4_inclusion_zones, 4);
195	options.AddExclusionZonesForPage(p4_exclusion_zones, 4);
196
197	DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val_with_zones.json"), DataExtractionModule::e_GenericKeyValue, &options);
198}
199
200//---------------------------------------------------------------------------------------
201// Classifies the pages of Invoice.pdf, Scientific_Publication.pdf and Email.pdf, writing the detected classes as JSON.
202//---------------------------------------------------------------------------------------
203void TestDocClassifier()
204{
205	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocClassification)) // Test if the add-on is installed
206	{
207		cout << endl;
208		cout << "Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available." << endl;
209		cout << "---------------------------------------------------------------" << endl;
210		cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
211		cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
212		cout << "module, ensure that the SDK is able to find the required files" << endl;
213		cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
214		return;
215	}
216
217	// Simple example: classify pages as a JSON file (no try/catch here: exceptions propagate to the handlers in main)
218	DataExtractionModule::ExtractData(input_path + UString("Invoice.pdf"), output_path + UString("Invoice_Classified.json"), DataExtractionModule::e_DocClassification);
219
220	// Classify pages as a JSON string, then save that string with WriteTextToFile
221	UString json = DataExtractionModule::ExtractData(input_path + UString("Scientific_Publication.pdf"), DataExtractionModule::e_DocClassification);
222	WriteTextToFile((output_path + "Scientific_Publication_Classified.json").c_str(), json);
223
224	// Example with customized options:
225	DataExtractionOptions options;
226	// Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
227	options.SetMinimumConfidenceThreshold(0.7);
228	DataExtractionModule::ExtractData(input_path + UString("Email.pdf"), output_path + UString("Email_Classified.json"), DataExtractionModule::e_DocClassification, &options);
229}
230
231int main(int argc, char* argv[]) // runs the five samples; returns 0 on success, 1 if any of them threw
232{
233	// The first step in every application using PDFNet is to initialize the 
234	// library and set the path to common PDF resources. The library is usually 
235	// initialized only once, but calling Initialize() multiple times is also fine.
236	PDFNet::Initialize(LicenseKey); // NOTE(review): called outside the try block below, so an exception here is not caught
237
238	int ret = 0;
239
240	try
241	{
242		PDFNet::AddResourceSearchPath("../../../Lib/"); // folder the SDK searches for the optional Data Extraction module files
243
244		TestTabularData();
245		TestDocumentStructure();
246		TestFormFields();
247		TestGenericKeyValue();
248		TestDocClassifier();
249	}
250	catch (Common::Exception& e) // the first sample that throws ends the run; later samples are skipped
251	{
252		cout << e << endl;
253		ret = 1;
254	}
255	catch (...)
256	{
257		cout << "Unknown Exception" << endl;
258		ret = 1;
259	}
260
261	PDFNet::Terminate(); // reached on both the success and the failure path
262
263	return ret;
264}
265

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8	"fmt"
9	"testing"
10	"os"
11	"flag"
12	. "github.com/pdftron/pdftron-go/v2"
13)
14
15var licenseKey string
16var modulePath string
17
18func init() { // registers the -license and -modulePath command-line flags
19    flag.StringVar(&licenseKey, "license", "", "License key for Apryse SDK")
20    flag.StringVar(&modulePath, "modulePath", "", "Path for downloaded modules")
21}
22
23//---------------------------------------------------------------------------------------
24// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
25// extract various types of data from PDF documents.
26//
27// The Apryse SDK Data Extraction suite can be downloaded from
28// https://docs.apryse.com/core/guides/info/modules#data-extraction-module
29//
30// Please contact us if you have any questions.
31//---------------------------------------------------------------------------------------
32
33// Relative path to the folder containing the test files.
34var inputPath = "../TestFiles/"
35var outputPath = "../TestFiles/Output/"
36
37//---------------------------------------------------------------------------------------
38
39func catch(err *error) { // used via defer: converts a panic in the calling function into an error stored in *err
40	if r := recover(); r != nil {
41		*err = fmt.Errorf("%v", r) // the recovered value is formatted with %v, whatever its type
42	}
43}
44
45//---------------------------------------------------------------------------------------
46
47func WriteTextToFile(outputFile string, text string) { // creates (or truncates) outputFile and writes text to it; errors are printed, not returned
48	f, err := os.Create(outputFile)
49	if err != nil {
50		fmt.Println(err)
51		return // f is nil when Create fails: stop here instead of closing and writing a nil file, which only printed a second, misleading error
52	}
53	defer f.Close()
54
55	_, err2 := f.WriteString(text)
56	if err2 != nil {
57		fmt.Println(err2)
58	}
59}
60
61//---------------------------------------------------------------------------------------
62// TabularDataTest extracts tables from table.pdf and financial.pdf as JSON (file and string) and XLSX (file and stream, page 1 only).
63//---------------------------------------------------------------------------------------
64
65func TabularDataTest() (err error) {
66	defer catch(&err) // a panic raised below is returned to the caller as err
67
68    PDFNetAddResourceSearchPath(modulePath) // modulePath is set by the -modulePath flag
69
70	// Skip this sample, after printing download instructions, when the tabular engine is not installed
71	if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_Tabular) {
72		fmt.Println("")
73		fmt.Println("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.")
74		fmt.Println("-----------------------------------------------------------------------------")
75		fmt.Println("The Data Extraction suite is an optional add-on, available for download")
76		fmt.Println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
77		fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
78		fmt.Println("using the PDFNetAddResourceSearchPath() function.")
79		fmt.Println("")
80		return nil // a missing module is reported on stdout but not treated as an error
81	}
82
83	// Extract tabular data as a JSON file
84	fmt.Println("Extract tabular data as a JSON file")
85
86	inputFile := inputPath + "table.pdf"
87	outputFile := outputPath + "table.json"
88	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_Tabular)
89
90	fmt.Println("Result saved in " + outputFile)
91
92	// Extract tabular data as a JSON string
93	fmt.Println("Extract tabular data as a JSON string")
94
95	inputFile = inputPath + "financial.pdf"
96	outputFile = outputPath + "financial.json"
97
98	json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_Tabular).(string) // the result is type-asserted to string; a different dynamic type would panic and be turned into err by catch
99	WriteTextToFile(outputFile, json)
100
101	fmt.Println("Result saved in " + outputFile)
102
103	// Extract tabular data as an XLSX file
104	fmt.Println("Extract tabular data as an XLSX file")
105
106	inputFile = inputPath + "table.pdf"
107	outputFile = outputPath + "table.xlsx"
108	DataExtractionModuleExtractToXLSX(inputFile, outputFile)
109
110	fmt.Println("Result saved in " + outputFile)
111
112	// Extract tabular data as an XLSX stream (also known as filter), then persist the stream to financial.xlsx
113	fmt.Println("Extract tabular data as an XLSX stream")
114
115	inputFile = inputPath + "financial.pdf"
116	outputFile = outputPath + "financial.xlsx"
117	outputXlsxStream := NewMemoryFilter(0, false)
118	outputFilter := NewFilter(outputXlsxStream) // NOTE(review): presumably wraps the memory filter as the generic Filter type ExtractToXLSX expects - confirm
119	options := NewDataExtractionOptions()
120	options.SetPages("1"); // restrict this extraction to page 1
121	DataExtractionModuleExtractToXLSX(inputFile, outputFilter, options)
122	outputXlsxStream.SetAsInputFilter()
123	outputXlsxStream.WriteToFile(outputFile, false)
124
125	fmt.Println("Result saved in " + outputFile)
126
127	return nil
128}
129
130//---------------------------------------------------------------------------------------
131// DocumentStructureTest extracts document structure from paragraphs_and_tables.pdf and tagged.pdf as JSON (file and string variants).
132//---------------------------------------------------------------------------------------
133
134func DocumentStructureTest() (err error) {
135	defer catch(&err) // a panic raised below is returned to the caller as err
136
137	// Skip this sample, after printing download instructions, when the document-structure engine is not installed
138	if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_DocStructure) {
139		fmt.Println("")
140		fmt.Println("Unable to run Data Extraction: Apryse SDK Structured Output module not available.")
141		fmt.Println("-----------------------------------------------------------------------------")
142		fmt.Println("The Data Extraction suite is an optional add-on, available for download")
143		fmt.Println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
144		fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
145		fmt.Println("using the PDFNetAddResourceSearchPath() function.")
146		fmt.Println("")
147		return nil // a missing module is reported on stdout but not treated as an error
148	}
149
150	// Extract document structure as a JSON file
151	fmt.Println("Extract document structure as a JSON file")
152
153	inputFile := inputPath + "paragraphs_and_tables.pdf"
154	outputFile := outputPath + "paragraphs_and_tables.json"
155	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocStructure)
156
157	fmt.Println("Result saved in " + outputFile)
158
159	// Extract document structure as a JSON string
160	fmt.Println("Extract document structure as a JSON string")
161
162	inputFile = inputPath + "tagged.pdf"
163	outputFile = outputPath + "tagged.json"
164	json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_DocStructure).(string) // the result is type-asserted to string; a different dynamic type would panic and be turned into err by catch
165	WriteTextToFile(outputFile, json)
166
167	fmt.Println("Result saved in " + outputFile)
168
169	return nil
170}
171
172//---------------------------------------------------------------------------------------
173// The following sample illustrates how to extract form fields from PDF documents.
174//---------------------------------------------------------------------------------------
175
176func FormFieldsTest() (err error) {
177	defer catch(&err)
178
179	// Test if the add-on is installed
180	if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_Form) {
181		fmt.Println("")
182		fmt.Println("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.")
183		fmt.Println("-----------------------------------------------------------------------------")
184		fmt.Println("The Data Extraction suite is an optional add-on, available for download")
185		fmt.Println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
186		fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
187		fmt.Println("using the PDFNetAddResourceSearchPath() function.")
188		fmt.Println("")
189		return nil
190	}
191
192	// Extract form fields as a JSON file
193	fmt.Println("Extract form fields as a JSON file")
194
195	inputFile := inputPath + "formfields-scanned.pdf"
196	outputFile := outputPath + "formfields-scanned.json"
197	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_Form)
198
199	fmt.Println("Result saved in " + outputFile)
200
201	// Extract form fields as a JSON string
202	fmt.Println("Extract form fields as a JSON string")
203
204	inputFile = inputPath + "formfields.pdf"
205	outputFile = outputPath + "formfields.json"
206
207	json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_Form).(string)
208	WriteTextToFile(outputFile, json)
209
210	fmt.Println("Result saved in " + outputFile)
211
212	//////////////////////////////////////////////////////////////////////////
213	// Detect and add form fields to a PDF document.
214	// PDF document already has form fields, and this sample will update to new found fields.
215	doc := NewPDFDoc(inputPath + "formfields-scanned-withfields.pdf")
216
217	fmt.Println("Extract form fields as a PDF file, keep new fields")
218	DataExtractionModuleDetectAndAddFormFieldsToPDF(doc)
219
220	outputFile = outputPath + "formfields-scanned-fields-new.pdf"
221	doc.Save(outputFile, uint(SDFDocE_linearized))
222	doc.Close()
223
224	fmt.Println("Result saved in " + outputFile)
225
226	//////////////////////////////////////////////////////////////////////////
227	// Detect and add form fields to a PDF document.
228	// PDF document already has form fields, and this sample will keep the original fields.
229	doc = NewPDFDoc(inputPath + "formfields-scanned-withfields.pdf")
230
231	// Setup DataExtractionOptions to keep old fields
232	options := NewDataExtractionOptions()
233	options.SetOverlappingFormFieldBehavior("KeepOld")
234
235	fmt.Println("Extract form fields as a PDF file, keep old fields")
236	DataExtractionModuleDetectAndAddFormFieldsToPDF(doc, options)
237
238	outputFile = outputPath + "formfields-scanned-fields-old.pdf"
239	doc.Save(outputFile, uint(SDFDocE_linearized))
240	doc.Close()
241
242	fmt.Println("Result saved in " + outputFile)
243
244	return nil
245}
246
247//---------------------------------------------------------------------------------------
248// The following sample illustrates how to extract key-value pairs from PDF documents.
249//---------------------------------------------------------------------------------------
250
251func GenericKeyValueTest() (err error) {
252	defer catch(&err)
253
254	// Test if the add-on is installed
255	if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_GenericKeyValue) {
256		fmt.Println("")
257		fmt.Println("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.")
258		fmt.Println("-----------------------------------------------------------------------------")
259		fmt.Println("The Data Extraction suite is an optional add-on, available for download")
260		fmt.Println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
261		fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
262		fmt.Println("using the PDFNetAddResourceSearchPath() function.")
263		fmt.Println("")
264		return nil
265	}
266
267	fmt.Println("Extract key-value pairs from a PDF")
268
269	inputFile := inputPath + "newsletter.pdf"
270	outputFile := outputPath + "newsletter_key_val.json"
271	// Simple example: Extract Keys & Values as a JSON file
272	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_GenericKeyValue)
273
274	fmt.Println("Result saved in " + outputFile)
275
276	// Example with customized options:
277	// Extract Keys & Values from pages 2-4, excluding ads
278	options := NewDataExtractionOptions()
279	options.SetPages("2-4")
280	
281	p2ExclusionZones := NewRectCollection()
282	// Exclude the add-on on page 2
283	// These coordinates are in PDF user space, with the origin at the bottom left corner of the page
284	// Coordinates rotate with the page, if it has rotation applied.
285	p2ExclusionZones.AddRect(NewRect(166, 47, 562, 222))
286	options.AddExclusionZonesForPage(p2ExclusionZones, 2)
287
288	p4InclusionZones := NewRectCollection()
289	p4ExclusionZones := NewRectCollection()
290	// Only include the article text for page 4, exclude ads and headings
291	p4InclusionZones.AddRect(NewRect(30, 432, 562, 684))
292	p4ExclusionZones.AddRect(NewRect(30, 657, 295, 684))
293	options.AddInclusionZonesForPage(p4InclusionZones, 4)
294	options.AddExclusionZonesForPage(p4ExclusionZones, 4)
295	
296	fmt.Println("Extract Key-Value pairs from specific pages and zones as a JSON file")
297	outputFile = outputPath + "newsletter_key_val_with_zones.json"
298	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_GenericKeyValue, options)
299
300	fmt.Println("Result saved in " + outputFile)
301
302	return nil
303}
304
305//---------------------------------------------------------------------------------------
306// The following sample illustrates how to extract document classes from PDF documents.
307//---------------------------------------------------------------------------------------
308
309func DocClassifierTest() (err error) {
310	defer catch(&err)
311
312	// Test if the add-on is installed
313	if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_DocClassification) {
314		fmt.Println("")
315		fmt.Println("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.")
316		fmt.Println("-----------------------------------------------------------------------------")
317		fmt.Println("The Data Extraction suite is an optional add-on, available for download")
318		fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
319		fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
320		fmt.Println("using the PDFNetAddResourceSearchPath() function.")
321		fmt.Println("")
322		return nil
323	}
324
325	// Simple example: classify pages as a JSON file
326	fmt.Println("Classify pages as a JSON file")
327
328	inputFile := inputPath + "Invoice.pdf"
329	outputFile := outputPath + "Invoice_Classified.json"
330	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocClassification)
331
332	fmt.Println("Result saved in " + outputFile)
333
334	// Classify pages as a JSON string
335	fmt.Println("Classify pages as a JSON string")
336
337	inputFile = inputPath + "Scientific_Publication.pdf"
338	outputFile = outputPath + "Scientific_Publication_Classified.json"
339	json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_DocClassification).(string)
340	WriteTextToFile(outputFile, json)
341
342	fmt.Println("Result saved in " + outputFile)
343
344	// Example with customized options:
345	fmt.Println("Classify pages with customized options")
346
347	inputFile = inputPath + "Email.pdf"
348	outputFile = outputPath + "Email_Classified.json"
349	options := NewDataExtractionOptions()
350	// Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
351	options.SetMinimumConfidenceThreshold(0.7)
352	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocClassification, options)
353
354	fmt.Println("Result saved in " + outputFile)
355
356	return nil
357}
358
359//---------------------------------------------------------------------------------------
360
361func TestDataExtraction(t *testing.T) {
362	// The first step in every application using PDFNet is to initialize the 
363	// library. The library is usually initialized only once, but calling 
364	// Initialize() multiple times is also fine.
365	PDFNetInitialize(licenseKey)
366
367	//-----------------------------------------------------------------------------------
368
369	PDFNetAddResourceSearchPath("../../../PDFNetC/Lib/")
370
371	//-----------------------------------------------------------------------------------
372
373	err := TabularDataTest()
374	if err != nil {
375		fmt.Println(fmt.Errorf("Unable to extract tabular data, error: %s", err))
376	}
377
378	//-----------------------------------------------------------------------------------
379
380	err = DocumentStructureTest()
381	if err != nil {
382		fmt.Println(fmt.Errorf("Unable to extract document structure data, error: %s", err))
383	}
384
385	//-----------------------------------------------------------------------------------
386
387	err = FormFieldsTest()
388	if err != nil {
389		fmt.Println(fmt.Errorf("Unable to extract form fields data, error: %s", err))
390	}
391
392	err = GenericKeyValueTest()
393	if err != nil {
394		fmt.Println(fmt.Errorf("Unable to extract key-value pairs, error: %s", err))
395	}
396
397	//-----------------------------------------------------------------------------------
398
399	err = DocClassifierTest()
400	if err != nil {
401		fmt.Println(fmt.Errorf("Unable to extract document classifications, error: %s", err))
402	}
403
404	//-----------------------------------------------------------------------------------
405
406	PDFNetTerminate()
407	fmt.Println("Done.")
408}
409

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import java.io.FileWriter;
7import java.io.BufferedWriter;
8import java.io.FileNotFoundException;
9import java.io.IOException;
10
11import com.pdftron.common.PDFNetException;
12import com.pdftron.pdf.*;
13import com.pdftron.filters.*;
14import com.pdftron.sdf.SDFDoc;
15
16//---------------------------------------------------------------------------------------
17// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
18// extract various types of data from PDF documents.
19//
20// The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
21//---------------------------------------------------------------------------------------
22
23public class DataExtractionTest {
24
25	static void writeTextToFile(String filename, String text) throws IOException
26	{
27		BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
28		writer.write(text);
29		writer.close();
30	}
31
32	//---------------------------------------------------------------------------------------
33	// The following sample illustrates how to extract tables from PDF documents.
34	//---------------------------------------------------------------------------------------
35	static void testTabularData()
36	{
37		try {
38			// Test if the add-on is installed
39			if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular))
40			{
41				System.out.println();
42				System.out.println("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.");
43				System.out.println("---------------------------------------------------------------");
44				System.out.println("The Data Extraction suite is an optional add-on, available for download");
45				System.out.println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
46				System.out.println("module, ensure that the SDK is able to find the required files");
47				System.out.println("using the PDFNet.addResourceSearchPath() function." );
48				System.out.println();
49				return;
50			}
51		} catch (PDFNetException e) {
52			System.out.println("Data Extraction module not available, error:");
53			e.printStackTrace();
54			System.out.println(e);
55		}
56
57		// Relative path to the folder containing test files.
58		String input_path = "../../TestFiles/";
59		String output_path = "../../TestFiles/Output/";
60
61		try {
62			// Extract tabular data as a JSON file
63			DataExtractionModule.extractData(input_path + "table.pdf", output_path + "table.json", DataExtractionModule.DataExtractionEngine.e_tabular);
64
65			// Extract tabular data as a JSON string
66			String json = DataExtractionModule.extractData(input_path + "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular);
67			writeTextToFile(output_path + "financial.json", json);
68
69			// Extract tabular data as an XLSX file
70			DataExtractionModule.extractToXLSX(input_path + "table.pdf", output_path + "table.xlsx");
71
72			// Extract tabular data as an XLSX stream (also known as filter)
73			DataExtractionOptions options = new DataExtractionOptions();
74			options.setPages("1");
75			MemoryFilter output_xlsx_stream = new MemoryFilter(0, false);
76			DataExtractionModule.extractToXLSX(input_path + "financial.pdf", output_xlsx_stream, options);
77			output_xlsx_stream.setAsInputFilter();
78			output_xlsx_stream.writeToFile(output_path + "financial.xlsx", false);
79
80		} catch (PDFNetException e) {
81			System.out.println(e);
82		}
83		catch (IOException e) {
84			System.out.println(e);
85		}
86	}
87
88	//---------------------------------------------------------------------------------------
89	// The following sample illustrates how to extract document structure from PDF documents.
90	//---------------------------------------------------------------------------------------
91	static void testDocumentStructure()
92	{
93		// Test if the add-on is installed
94		try {
95			if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure))
96			{
97				System.out.println();
98				System.out.println("Unable to run Data Extraction: Apryse SDK Structured Output module not available.");
99				System.out.println("---------------------------------------------------------------");
100				System.out.println("The Data Extraction suite is an optional add-on, available for download");
101				System.out.println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
102				System.out.println("module, ensure that the SDK is able to find the required files");
103				System.out.println("using the PDFNet.addResourceSearchPath() function." );
104				System.out.println();
105				return;
106			}
107		} catch (PDFNetException e) {
108			System.out.println("Data Extraction module not available, error:");
109			e.printStackTrace();
110			System.out.println(e);
111		}
112
113		// Relative path to the folder containing test files.
114		String input_path = "../../TestFiles/";
115		String output_path = "../../TestFiles/Output/";
116
117		try {
118			// Extract document structure as a JSON file
119			DataExtractionModule.extractData(input_path + "paragraphs_and_tables.pdf", output_path + "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure);
120
121			// Extract document structure as a JSON string
122			String json = DataExtractionModule.extractData(input_path + "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure);
123			writeTextToFile(output_path + "tagged.json", json);
124
125		} catch (PDFNetException e) {
126			System.out.println(e);
127		}
128		catch (IOException e) {
129			System.out.println(e);
130		}
131	}
132
133	//---------------------------------------------------------------------------------------
134	// The following sample illustrates how to extract form fields from PDF documents.
135	//---------------------------------------------------------------------------------------
136	static void testFormFields()
137	{
138		try {
139			// Test if the add-on is installed
140			if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
141			{
142				System.out.println();
143				System.out.println("Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.");
144				System.out.println("---------------------------------------------------------------");
145				System.out.println("The Data Extraction suite is an optional add-on, available for download");
146				System.out.println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
147				System.out.println("module, ensure that the SDK is able to find the required files");
148				System.out.println("using the PDFNet.addResourceSearchPath() function." );
149				System.out.println();
150				return;
151			}
152		} catch (PDFNetException e) {
153			System.out.println("Data Extraction module not available, error:");
154			e.printStackTrace();
155			System.out.println(e);
156		}
157
158		// Relative path to the folder containing test files.
159		String input_path = "../../TestFiles/";
160		String output_path = "../../TestFiles/Output/";
161
162		try {
163			// Extract form fields as a JSON file
164			DataExtractionModule.extractData(input_path + "formfields-scanned.pdf", output_path + "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form);
165
166			// Extract form fields as a JSON string
167			String json = DataExtractionModule.extractData(input_path + "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form);
168			writeTextToFile(output_path + "formfields.json", json);
169
170			//---------------------------------------------------------------------------------------
171			// Detect and add form fields to a PDF document.
172			// PDF document already has form fields, and this sample will update to new found fields.
173			//---------------------------------------------------------------------------------------
174			try (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
175			{
176				DataExtractionModule.detectAndAddFormFieldsToPDF(doc);
177
178				// Save the modfied pdf document
179				doc.save(output_path + "formfields-scanned-fields-new.pdf", SDFDoc.SaveMode.LINEARIZED, null);
180			} catch (Exception e) {
181				e.printStackTrace();
182			}
183
184			//---------------------------------------------------------------------------------------
185			// Detect and add form fields to a PDF document.
186			// PDF document already has form fields, and this sample will keep the original fields.
187			//---------------------------------------------------------------------------------------
188			try (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
189			{
190				// Setup DataExtractionOptions to keep old fields
191				DataExtractionOptions options = new DataExtractionOptions();
192				options.setOverlappingFormFieldBehavior("KeepOld");
193
194				DataExtractionModule.detectAndAddFormFieldsToPDF(doc, options);
195
196				// Save the modfied pdf document
197				doc.save(output_path + "formfields-scanned-fields-old.pdf", SDFDoc.SaveMode.LINEARIZED, null);
198			} catch (Exception e) {
199				e.printStackTrace();
200			}
201
202		} catch (PDFNetException e) {
203			System.out.println(e);
204		}
205		catch (IOException e) {
206			System.out.println(e);
207		}
208	}
209
210	//---------------------------------------------------------------------------------------
211	// The following sample illustrates how to extract key-value pairs from PDF documents.
212	//---------------------------------------------------------------------------------------
213	public static void testGenericKeyValue() {
214		try {
215			// Test if the add-on is installed
216			if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
217			{
218				System.out.println();
219				System.out.println("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.");
220				System.out.println("---------------------------------------------------------------");
221				System.out.println("The Data Extraction suite is an optional add-on, available for download");
222				System.out.println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
223				System.out.println("module, ensure that the SDK is able to find the required files");
224				System.out.println("using the PDFNet.addResourceSearchPath() function." );
225				System.out.println();
226				return;
227			}
228		} catch (PDFNetException e) {
229			System.out.println("Data Extraction module not available, error:");
230			e.printStackTrace();
231			System.out.println(e);
232		}
233
234		// Relative path to the folder containing test files.
235		String input_path = "../../TestFiles/";
236		String output_path = "../../TestFiles/Output/";
237
238		try {
239
240			// Simple example: Extract Keys & Values as a JSON file
241			DataExtractionModule.extractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value);
242
243			// Example with customized options:
244			// Extract Keys & Values from pages 2-4, excluding ads
245			DataExtractionOptions options = new DataExtractionOptions();
246			options.setPages("2-4");
247
248			RectCollection p2ExclusionZones = new RectCollection();
249			// Exclude the add-on on page 2
250			// These coordinates are in PDF user space, with the origin at the bottom left corner of the page
251			// Coordinates rotate with the page, if it has rotation applied.
252			p2ExclusionZones.addRect(166, 47, 562, 222);
253			options.addExclusionZonesForPage(p2ExclusionZones, 2);
254
255			RectCollection p4InclusionZones = new RectCollection();
256			RectCollection p4ExclusionZones = new RectCollection();
257			// Only include the article text for page 4, exclude ads and headings
258			p4InclusionZones.addRect(30, 432, 562, 684);
259			p4ExclusionZones.addRect(30, 657, 295, 684);
260			options.addInclusionZonesForPage(p4InclusionZones, 4);
261			options.addExclusionZonesForPage(p4ExclusionZones, 4);
262
263			DataExtractionModule.extractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options);
264
265		} catch (Exception e) {
266			System.out.println(e);
267		}        
268  }
269
270	//---------------------------------------------------------------------------------------
271	// The following sample illustrates how to extract document classes from PDF documents.
272	//---------------------------------------------------------------------------------------
273	public static void testDocClassifier() {
274		try {
275			// Test if the add-on is installed
276			if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_classification))
277			{
278				System.out.println();
279				System.out.println("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.");
280				System.out.println("---------------------------------------------------------------");
281				System.out.println("The Data Extraction suite is an optional add-on, available for download");
282				System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
283				System.out.println("module, ensure that the SDK is able to find the required files");
284				System.out.println("using the PDFNet.addResourceSearchPath() function." );
285				System.out.println();
286				return;
287			}
288		} catch (PDFNetException e) {
289			System.out.println("Data Extraction module not available, error:");
290			e.printStackTrace();
291			System.out.println(e);
292		}
293
294		// Relative path to the folder containing test files.
295		String input_path = "../../TestFiles/";
296		String output_path = "../../TestFiles/Output/";
297
298		try {
299
300			// Simple example: classify pages as a JSON file
301			DataExtractionModule.extractData(input_path + "Invoice.pdf", output_path + "Invoice_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification);
302
303			// Classify pages as a JSON string
304			String json = DataExtractionModule.extractData(input_path + "Scientific_Publication.pdf", DataExtractionModule.DataExtractionEngine.e_doc_classification);
305			writeTextToFile(output_path + "Scientific_Publication_Classified.json", json);
306
307			// Example with customized options:
308			DataExtractionOptions options = new DataExtractionOptions();
309			// Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
310			options.setMinimumConfidenceThreshold(0.7);
311			DataExtractionModule.extractData(input_path + "Email.pdf", output_path + "Email_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification, options);
312
313		} catch (Exception e) {
314			System.out.println(e);
315		}        
316  }
317
318	public static void main(String[] args)
319	{
320		// The first step in every application using PDFNet is to initialize the 
321		// library and set the path to common PDF resources. The library is usually 
322		// initialized only once, but calling initialize() multiple times is also fine.
323		PDFNet.initialize(PDFTronLicense.Key());
324		PDFNet.addResourceSearchPath("../../../Lib/");
325
326		testTabularData();
327		testDocumentStructure();
328		testFormFields();
329		testGenericKeyValue();
330		testDocClassifier();
331
332		PDFNet.terminate();
333	}
334}
335

1 <?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
10//---------------------------------------------------------------------------------------
11// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
12// extract various types of data from PDF documents.
13//
14// The Apryse SDK Data Extraction suite can be downloaded from
15// https://docs.apryse.com/core/guides/info/modules
16//
17// Please contact us if you have any questions.
18//---------------------------------------------------------------------------------------
19
/**
 * Writes $text to the file at $outputFile, creating the file if needed and
 * replacing any existing content.
 *
 * @param string $outputFile Path of the file to write.
 * @param string $text       Text to store in the file.
 * @throws RuntimeException  If the file cannot be opened or written.
 */
function WriteTextToFile($outputFile, $text)
{
	// file_put_contents() opens, writes and closes in a single call and
	// returns false on failure. Raising an Exception subclass lets the
	// callers' catch(Exception) blocks report the problem, instead of an
	// invalid handle being passed on to fwrite()/fclose().
	if (file_put_contents($outputFile, $text) === false) {
		throw new RuntimeException("Unable to write to file: " . $outputFile);
	}
}
26
27function main()
28{
29	// Relative path to the folder containing the test files.
30	$inputPath = getcwd()."/../../TestFiles/";
31	$outputPath = $inputPath."Output/";
32
33	// The first step in every application using PDFNet is to initialize the 
34	// library. The library is usually initialized only once, but calling 
35	// Initialize() multiple times is also fine.
36	global $LicenseKey;
37	PDFNet::Initialize($LicenseKey);
38	PDFNet::GetSystemFontList();    // Wait for fonts to be loaded if they haven't already. This is done because PHP can run into errors when shutting down if font loading is still in progress.
39	
40	//-----------------------------------------------------------------------------------
41
42	PDFNet::AddResourceSearchPath("../../../PDFNetC/Lib/");
43
44	//////////////////////////////////////////////////////////////////////////
45	// The following sample illustrates how to extract tables from PDF documents.
46	//////////////////////////////////////////////////////////////////////////
47
48	// Test if the add-on is installed
49	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular)) {
50		echo(nl2br("\n"));
51		echo(nl2br("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.\n"));
52		echo(nl2br("-----------------------------------------------------------------------------\n"));
53		echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
54		echo(nl2br("at https://docs.apryse.com/core/guides/info/modules. If you have already\n"));
55		echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
56		echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
57		echo(nl2br("\n"));
58	}
59	else {
60		try {
61			// Extract tabular data as a JSON file
62			echo(nl2br("Extract tabular data as a JSON file\n"));
63
64			$outputFile = $outputPath."table.json";
65			DataExtractionModule::ExtractData($inputPath."table.pdf", $outputFile, DataExtractionModule::e_Tabular);
66
67			echo(nl2br("Result saved in " . $outputFile . "\n"));
68
69			///////////////////////////////////////////////////////
70			// Extract tabular data as a JSON string
71			echo(nl2br("Extract tabular data as a JSON string\n"));
72
73			$outputFile = $outputPath."financial.json";
74			$json = DataExtractionModule::ExtractData($inputPath."financial.pdf", DataExtractionModule::e_Tabular);
75			WriteTextToFile($outputFile, $json);
76
77			echo(nl2br("Result saved in " . $outputFile . "\n"));
78
79			///////////////////////////////////////////////////////
80			// Extract tabular data as an XLSX file
81			echo(nl2br("Extract tabular data as an XLSX file\n"));
82
83			$outputFile = $outputPath."table.xlsx";
84			DataExtractionModule::ExtractToXLSX($inputPath."table.pdf", $outputFile);
85
86			echo(nl2br("Result saved in " . $outputFile . "\n"));
87
88			///////////////////////////////////////////////////////
89			// Extract tabular data as an XLSX stream (also known as filter)
90			echo(nl2br("Extract tabular data as an XLSX stream\n"));
91
92			$outputFile = $outputPath."financial.xlsx";
93			$outputXlsxStream = new MemoryFilter(0, false);
94			$options = new DataExtractionOptions();
95			$options->SetPages("1"); // page 1
96			DataExtractionModule::ExtractToXLSX($inputPath."financial.pdf", $outputXlsxStream, $options);
97			$outputXlsxStream->SetAsInputFilter();
98			$outputXlsxStream->WriteToFile($outputFile, false);
99
100			echo(nl2br("Result saved in " . $outputFile . "\n"));
101		}
102		catch(Exception $e) {
103			echo(nl2br("Unable to extract tabular data, error: " . $e->getMessage() . "\n"));
104		}
105	}
106
107	//////////////////////////////////////////////////////////////////////////
108	// The following sample illustrates how to extract document structure from PDF documents.
109	//////////////////////////////////////////////////////////////////////////
110
111	// Test if the add-on is installed
112	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure)) {
113		echo(nl2br("\n"));
114		echo(nl2br("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.\n"));
115		echo(nl2br("-----------------------------------------------------------------------------\n"));
116		echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
117		echo(nl2br("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module. If you have already\n"));
118		echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
119		echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
120		echo(nl2br("\n"));
121	}
122	else {
123		try {
124			// Extract document structure as a JSON file
125			echo(nl2br("Extract document structure as a JSON file\n"));
126
127			$outputFile = $outputPath."paragraphs_and_tables.json";
128			DataExtractionModule::ExtractData($inputPath."paragraphs_and_tables.pdf", $outputFile, DataExtractionModule::e_DocStructure);
129
130			echo(nl2br("Result saved in " . $outputFile . "\n"));
131
132			///////////////////////////////////////////////////////
133			// Extract document structure as a JSON string
134			echo(nl2br("Extract document structure as a JSON string\n"));
135
136			$outputFile = $outputPath."tagged.json";
137			$json = DataExtractionModule::ExtractData($inputPath."tagged.pdf", DataExtractionModule::e_DocStructure);
138			WriteTextToFile($outputFile, $json);
139
140			echo(nl2br("Result saved in " . $outputFile . "\n"));
141		}
142		catch(Exception $e) {
143			echo(nl2br("Unable to extract document structure data, error: " . $e->getMessage() . "\n"));
144		}
145	}
146
147	//////////////////////////////////////////////////////////////////////////
148	// The following sample illustrates how to extract form fields from PDF documents.
149	//////////////////////////////////////////////////////////////////////////
150
151	// Test if the add-on is installed
152	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form)) {
153		echo(nl2br("\n"));
154		echo(nl2br("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.\n"));
155		echo(nl2br("-----------------------------------------------------------------------------\n"));
156		echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
157		echo(nl2br("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already\n"));
158		echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
159		echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
160		echo(nl2br("\n"));
161	}
162	else {
163		try {
164			// Extract form fields as a JSON file
165			echo(nl2br("Extract form fields as a JSON file\n"));
166
167			$outputFile = $outputPath."formfields-scanned.json";
168			DataExtractionModule::ExtractData($inputPath."formfields-scanned.pdf", $outputFile, DataExtractionModule::e_Form);
169
170			echo(nl2br("Result saved in " . $outputFile . "\n"));
171
172			///////////////////////////////////////////////////////
173			// Extract form fields as a JSON string
174			echo(nl2br("Extract form fields as a JSON string\n"));
175
176			$outputFile = $outputPath."formfields.json";
177			$json = DataExtractionModule::ExtractData($inputPath."formfields.pdf", DataExtractionModule::e_Form);
178			WriteTextToFile($outputFile, $json);
179
180			echo(nl2br("Result saved in " . $outputFile . "\n"));
181
182			///////////////////////////////////////////////////////
183			// Detect and add form fields to a PDF document.
184			// PDF document already has form fields, and this sample will update to new found fields.
185			echo(nl2br("Extract form fields as a PDF file\n"));
186
187			$doc = new PDFDoc($inputPath."formfields-scanned-withfields.pdf");
188			DataExtractionModule::DetectAndAddFormFieldsToPDF($doc);
189			$doc->Save($outputPath."formfields-scanned-fields-new.pdf", SDFDoc::e_linearized);
190			$doc->Close();
191
192			echo(nl2br("Result saved in " . $outputPath ."formfields-scanned-fields-new.pdf" . "\n"));
193
194			///////////////////////////////////////////////////////
195			// Detect and add form fields to a PDF document.
196			// PDF document already has form fields, and this sample will keep the original fields.
197			echo(nl2br("Extract form fields as a PDF file\n"));
198			
199			$doc = new PDFDoc($inputPath."formfields-scanned-withfields.pdf");
200			$options = new DataExtractionOptions();
201			$options->SetOverlappingFormFieldBehavior("KeepOld");
202			DataExtractionModule::DetectAndAddFormFieldsToPDF($doc, $options);
203			$doc->Save($outputPath."formfields-scanned-fields-old.pdf", SDFDoc::e_linearized);
204			$doc->Close();
205
206			echo(nl2br("Result saved in " . $outputPath ."formfields-scanned-fields-old.pdf" . "\n"));
207
208		}
209		catch(Exception $e) {
210			echo(nl2br("Unable to extract form fields data, error: " . $e->getMessage() . "\n"));
211		}
212	}
213
214	//////////////////////////////////////////////////////////////////////////
215	// The following sample illustrates how to extract key-value pairs from PDF documents.
216	//////////////////////////////////////////////////////////////////////////
217
218	// Test if the add-on is installed
219	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_GenericKeyValue)) {
220		echo(nl2br("\n"));
221		echo(nl2br("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.\n"));
222		echo(nl2br("-----------------------------------------------------------------------------\n"));
223		echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
224		echo(nl2br("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already\n"));
225		echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
226		echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
227		echo(nl2br("\n"));
228	}
229	else {
230		try {
231			
232			echo(nl2br("Extract key-value pairs from a PDF\n"));
233			// Simple example: Extract Keys & Values as a JSON file
234			$outputFile = $outputPath."newsletter_key_val.json";
235			DataExtractionModule::ExtractData($inputPath."newsletter.pdf", $outputFile, DataExtractionModule::e_GenericKeyValue);
236
237			echo(nl2br("Result saved in " . $outputFile . "\n"));
238
239			// Example with customized options:
240			// Extract Keys & Values from pages 2-4, excluding ads
241			$options = new DataExtractionOptions();
242			$options->setPages("2-4");
243
244			$p2ExclusionZones = new RectCollection();
245			// Exclude the ad on page 2
246			// These coordinates are in PDF user space, with the origin at the bottom left corner of the page
247			// Coordinates rotate with the page, if it has rotation applied.
248			$p2ExclusionZones->AddRect(new Rect(166.0, 47.0, 562.0, 222.0));
249			$options->AddExclusionZonesForPage($p2ExclusionZones, 2);
250
251			$p4InclusionZones = new RectCollection();
252			$p4ExclusionZones = new RectCollection();
253			// Only include the article text for page 4, exclude ads and headings
254			$p4InclusionZones->AddRect(new Rect(30.0, 432.0, 562.0, 684.0));
255			$p4ExclusionZones->AddRect(new Rect(30.0, 657.0, 295.0, 684.0));
256			$options->AddInclusionZonesForPage($p4InclusionZones, 4);
257			$options->AddExclusionZonesForPage($p4ExclusionZones, 4);
258
259			echo(nl2br("Extract Key-Value pairs from specific pages and zones as a JSON file\n"));
260			$outputFile = $outputPath."newsletter_key_val_with_zones.json";
261			DataExtractionModule::ExtractData($inputPath."newsletter.pdf", $outputFile, DataExtractionModule::e_GenericKeyValue, $options);
262
263			echo(nl2br("Result saved in " . $outputFile . "\n"));
264		}
265		catch(Exception $e) {
266			echo(nl2br("Unable to extract document structure data, error: " . $e->getMessage() . "\n"));
267		}
268	}
269
270	//////////////////////////////////////////////////////////////////////////
271	// The following sample illustrates how to extract document classes from PDF documents.
272	//////////////////////////////////////////////////////////////////////////
273
274	// Test if the add-on is installed
275	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocClassification)) {
276		echo(nl2br("\n"));
277		echo(nl2br("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.\n"));
278		echo(nl2br("-----------------------------------------------------------------------------\n"));
279		echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
280		echo(nl2br("at https://docs.apryse.com/documentation/core/info/modules/. If you have already\n"));
281		echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
282		echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
283		echo(nl2br("\n"));
284	}
285	else {
286		try {
287			// Simple example: classify pages as a JSON file
288			echo(nl2br("Classify pages as a JSON file\n"));
289
290			$outputFile = $outputPath."Invoice_Classified.json";
291			DataExtractionModule::ExtractData($inputPath."Invoice.pdf", $outputFile, DataExtractionModule::e_DocClassification);
292
293			echo(nl2br("Result saved in " . $outputFile . "\n"));
294
295			///////////////////////////////////////////////////////
296			// Classify pages as a JSON string
297			echo(nl2br("Classify pages as a JSON string\n"));
298
299			$outputFile = $outputPath."Scientific_Publication_Classified.json";
300			$json = DataExtractionModule::ExtractData($inputPath."Scientific_Publication.pdf", DataExtractionModule::e_DocClassification);
301			WriteTextToFile($outputFile, $json);
302
303			echo(nl2br("Result saved in " . $outputFile . "\n"));
304
305			///////////////////////////////////////////////////////
306			// Example with customized options:
307			echo(nl2br("Classify pages with customized options\n"));
308
309			$options = new DataExtractionOptions();
310			// Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
311			$options->SetMinimumConfidenceThreshold(0.7);
312			$outputFile = $outputPath."Email_Classified.json";
313			DataExtractionModule::ExtractData($inputPath."Email.pdf", $outputFile, DataExtractionModule::e_DocClassification, $options);
314
315			echo(nl2br("Result saved in " . $outputFile . "\n"));
316		}
317		catch(Exception $e) {
318			echo(nl2br("Unable to extract document structure data, error: " . $e->getMessage() . "\n"));
319		}
320	}
321
322	//-----------------------------------------------------------------------------------
323
324	PDFNet::Terminate();
325	echo(nl2br("Done.\n"));
326}
327
328main();
329?>
330

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6//---------------------------------------------------------------------------------------
7// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
8// extract various types of data from PDF documents.
9//
10// The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
11//---------------------------------------------------------------------------------------
12
13const fs = require('fs');
14const { PDFNet } = require('../../lib/pdfnet.js');
15const PDFTronLicense = require('../../LicenseKey/NODEJS/LicenseKey');
16
17((exports) => {
18	'use strict';
19
20	exports.runDataExtractionTest = () => {
21
22		const main = async () => {
23
24			const inputPath = '../TestFiles/';
25			const outputPath = '../TestFiles/Output/';
26
27			//////////////////////////////////////////////////////////////////////////
28
29			await PDFNet.addResourceSearchPath('../../lib/');
30
31			//////////////////////////////////////////////////////////////////////////
32			// The following sample illustrates how to extract tables from PDF documents.
33			//////////////////////////////////////////////////////////////////////////
34
35			// Test if the add-on is installed
36			if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular)) {
37				console.log('\nUnable to run Data Extraction: Apryse SDK Tabular Data module not available.');
38				console.log('---------------------------------------------------------------');
39				console.log('The Data Extraction suite is an optional add-on, available for download');
40				console.log('at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already');
41				console.log('downloaded this module, ensure that the SDK is able to find the required files');
42				console.log('using the PDFNet.addResourceSearchPath() function.\n');
43			}
44			else
45			{
46				try {
47					// Extract tabular data as a JSON file
48					console.log('Extract tabular data as a JSON file');
49
50					let outputFile = outputPath + 'table.json';
51					await PDFNet.DataExtractionModule.extractData(inputPath + 'table.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular);
52
53					console.log('Result saved in ' + outputFile);
54
55					///////////////////////////////////////////////////////
56					// Extract tabular data as a JSON string
57					console.log('Extract tabular data as a JSON string');
58
59					outputFile = outputPath + 'financial.json';
60					const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'financial.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular);
61					fs.writeFileSync(outputFile, json);
62
63					console.log('Result saved in ' + outputFile);
64
65					///////////////////////////////////////////////////////
66					// Extract tabular data as an XLSX file
67					console.log('Extract tabular data as an XLSX file');
68
69					outputFile = outputPath + 'table.xlsx';
70					await PDFNet.DataExtractionModule.extractToXLSX(inputPath + 'table.pdf', outputFile);
71
72					console.log('Result saved in ' + outputFile);
73
74					///////////////////////////////////////////////////////
75					// Extract tabular data as an XLSX stream (also known as filter)
76					console.log('Extract tabular data as an XLSX stream');
77
78					outputFile = outputPath + 'financial.xlsx';
79					const outputXlsxStream = await PDFNet.Filter.createMemoryFilter(0, false);
80					const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
81					options.setPages('1'); // page 1
82					await PDFNet.DataExtractionModule.extractToXLSXWithFilter(inputPath + 'financial.pdf', outputXlsxStream, options);
83					outputXlsxStream.memoryFilterSetAsInputFilter();
84					outputXlsxStream.writeToFile(outputFile, false);
85
86					console.log('Result saved in ' + outputFile);
87				} catch (err) {
88					console.log(err);
89				}
90			}
91
92			//////////////////////////////////////////////////////////////////////////
93			// The following sample illustrates how to extract document structure from PDF documents.
94			//////////////////////////////////////////////////////////////////////////
95
96			// Test if the add-on is installed
97			if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure)) {
98				console.log('\nUnable to run Data Extraction: Apryse SDK Structured Output module not available.');
99				console.log('---------------------------------------------------------------');
100				console.log('The Data Extraction suite is an optional add-on, available for download');
101				console.log('at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already');
102				console.log('downloaded this module, ensure that the SDK is able to find the required files');
103				console.log('using the PDFNet.addResourceSearchPath() function.\n');
104			}
105			else
106			{
107				try {
108					// Extract document structure as a JSON file
109					console.log('Extract document structure as a JSON file');
110
111					let outputFile = outputPath + 'paragraphs_and_tables.json';
112					await PDFNet.DataExtractionModule.extractData(inputPath + 'paragraphs_and_tables.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
113
114					console.log('Result saved in ' + outputFile);
115
116					///////////////////////////////////////////////////////
117					// Extract document structure as a JSON string
118					console.log('Extract document structure as a JSON string');
119
120					outputFile = outputPath + 'tagged.json';
121					const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'tagged.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
122					fs.writeFileSync(outputFile, json);
123
124					console.log('Result saved in ' + outputFile);
125				} catch (err) {
126					console.log(err);
127				}
128			}
129
130			//////////////////////////////////////////////////////////////////////////
131			// The following sample illustrates how to extract form fields from PDF documents.
132			//////////////////////////////////////////////////////////////////////////
133
134			// Test if the add-on is installed
135			if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_Form)) {
136				console.log('\nUnable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.');
137				console.log('---------------------------------------------------------------');
138				console.log('The Data Extraction suite is an optional add-on, available for download');
139				console.log('at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already');
140				console.log('downloaded this module, ensure that the SDK is able to find the required files');
141				console.log('using the PDFNet.addResourceSearchPath() function.\n');
142			}
143			else
144			{
145				try {
146					// Extract form fields as a JSON file
147					console.log('Extract form fields as a JSON file');
148
149					let outputFile = outputPath + 'formfields-scanned.json';
150					await PDFNet.DataExtractionModule.extractData(inputPath + 'formfields-scanned.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_Form);
151
152					console.log('Result saved in ' + outputFile);
153
154					///////////////////////////////////////////////////////
155					// Extract form fields as a JSON string
156					console.log('Extract form fields as a JSON string');
157
158					outputFile = outputPath + 'formfields.json';
159					const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'formfields.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_Form);
160					fs.writeFileSync(outputFile, json);
161
162					console.log('Result saved in ' + outputFile);
163
164					//////////////////////////////////////////////////////////////////////////
165					// Detect and add form fields to a PDF document.
166					// Document already has form fields, and this sample will update to new found fields.
167					{
168						console.log('Detect and add form fields in a PDF file, keep new fields');
169
170						const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'formfields-scanned-withfields.pdf');
171
172						await PDFNet.DataExtractionModule.detectAndAddFormFieldsToPDF(doc);
173						outputFile = outputPath + 'formfields-scanned-fields-new.pdf';
174						await doc.save(outputFile, PDFNet.SDFDoc.SaveOptions.e_linearized);
175
176						console.log('Result saved in ' + outputFile);
177					}
178
179					//////////////////////////////////////////////////////////////////////////
180					// Detect and add form fields to a PDF document.
181					// Document already has form fields, and this sample will keep the original fields.
182					{
183						console.log('Detect and add form fields in a PDF file, keep old fields');
184
185						const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'formfields-scanned-withfields.pdf');
186
187						const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
188						options.setOverlappingFormFieldBehavior('KeepOld');
189
190						await PDFNet.DataExtractionModule.detectAndAddFormFieldsToPDF(doc, options);
191						outputFile = outputPath + 'formfields-scanned-fields-old.pdf';
192						await doc.save(outputFile, PDFNet.SDFDoc.SaveOptions.e_linearized);
193					}
194
195					console.log('Result saved in ' + outputFile);
196
197				} catch (err) {
198					console.log(err);
199				}
200			}
201
202			//////////////////////////////////////////////////////////////////////////
203			// The following sample illustrates how to extract key-value pairs from PDF documents.
204			//////////////////////////////////////////////////////////////////////////
205			if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_GenericKeyValue)) {
206				console.log();
207				console.log('Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.');
208				console.log('---------------------------------------------------------------');
209				console.log('The Data Extraction suite is an optional add-on, available for download');
210				console.log('at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this');
211				console.log('module, ensure that the SDK is able to find the required files');
212				console.log('using the PDFNet.addResourceSearchPath() function.');
213				console.log();
214			}
215			else
216			{
217				try {
218					// Simple example: Extract Keys & Values as a JSON file
219					console.log('Extract Key-Value pairs as a JSON file');
220					await PDFNet.DataExtractionModule.extractData(inputPath + 'newsletter.pdf', outputPath + 'newsletter_key_val.json', PDFNet.DataExtractionModule.DataExtractionEngine.e_GenericKeyValue);
221					console.log('Result saved in ' + outputPath + 'newsletter_key_val.json');
222					
223					const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
224					options.setPages('2-4');
225				
226					const p2ExclusionZones = [];
227					// Exclude the add-on page 2
228					// These coordinates are in PDF user space, with the origin at the bottom left corner of the page
229					// Coordinates rotate with the page, if it has rotation applied.
230					p2ExclusionZones.push(new PDFNet.Rect(166, 47, 562, 222));
231					options.addExclusionZonesForPage(p2ExclusionZones, 2);
232				
233					const p4InclusionZones = [];
234					const p4ExclusionZones = [];
235					// Only include the article text for page 4, exclude ads and headings
236					p4InclusionZones.push(new PDFNet.Rect(30, 432, 562, 684));
237					p4ExclusionZones.push(new PDFNet.Rect(30, 657, 295, 684));
238					options.addInclusionZonesForPage(p4InclusionZones, 4);
239					options.addExclusionZonesForPage(p4ExclusionZones, 4);
240					console.log('Extract Key-Value pairs from specific pages and zones as a JSON file');
241					await PDFNet.DataExtractionModule.extractData(inputPath + 'newsletter.pdf', outputPath + 'newsletter_key_val_with_zones.json', PDFNet.DataExtractionModule.DataExtractionEngine.e_GenericKeyValue, options);
242					console.log('Result saved in ' + outputPath + 'newsletter_key_val_with_zones.json');
243				} catch (err) {
244					console.log(err);
245				}
246			}
247
248			//////////////////////////////////////////////////////////////////////////
249			// The following sample illustrates how to extract document classes from PDF documents.
250			//////////////////////////////////////////////////////////////////////////
251
252			// Test if the add-on is installed
253			if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_DocClassification)) {
254				console.log('\nUnable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.');
255				console.log('---------------------------------------------------------------');
256				console.log('The Data Extraction suite is an optional add-on, available for download');
257				console.log('at https://docs.apryse.com/documentation/core/info/modules/. If you have already');
258				console.log('downloaded this module, ensure that the SDK is able to find the required files');
259				console.log('using the PDFNet.addResourceSearchPath() function.\n');
260			}
261			else
262			{
263				try {
264					// Simple example: classify pages as a JSON file
265					console.log('Classify pages as a JSON file');
266
267					let outputFile = outputPath + 'Invoice_Classified.json';
268					await PDFNet.DataExtractionModule.extractData(inputPath + 'Invoice.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_DocClassification);
269
270					console.log('Result saved in ' + outputFile);
271
272					///////////////////////////////////////////////////////
273					// Classify pages as a JSON string
274					console.log('Classify pages as a JSON string');
275
276					outputFile = outputPath + 'Scientific_Publication_Classified.json';
277					const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'Scientific_Publication.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_DocClassification);
278					fs.writeFileSync(outputFile, json);
279
280					console.log('Result saved in ' + outputFile);
281					
282					///////////////////////////////////////////////////////
283					// Example with customized options:
284					console.log('Classify pages with customized options');
285
286					const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
287					// Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
288					options.setMinimumConfidenceThreshold(0.7);
289					outputFile = outputPath + 'Email_Classified.json';
290					await PDFNet.DataExtractionModule.extractData(inputPath + 'Email.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_DocClassification, options);
291
292					console.log('Result saved in ' + outputFile);
293					
294				} catch (err) {
295					console.log(err);
296				}
297			}
298
299			//////////////////////////////////////////////////////////////////////////
300
301			console.log('Done.');
302		};
303
304		PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function (error) {
305			console.log('Error: ' + JSON.stringify(error));
306		}).then(function () { return PDFNet.shutdown(); });
307	};
308	exports.runDataExtractionTest();
309})(exports);
310// eslint-disable-next-line spaced-comment
311//# sourceURL=DataExtractionTest.js
312

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11import platform
12
13sys.path.append("../../LicenseKey/PYTHON")
14from LicenseKey import *
15
16#---------------------------------------------------------------------------------------
17# The Data Extraction suite is an optional PDFNet add-on collection that can be used to
18# extract various types of data from PDF documents.
19#
20# The Apryse SDK Data Extraction suite can be downloaded from
21# https://docs.apryse.com/core/guides/info/modules#data-extraction-module
22#
23# Please contact us if you have any questions.
24#---------------------------------------------------------------------------------------
25
26# Relative path to the folder containing the test files.
27inputPath = "../../TestFiles/"
28outputPath = "../../TestFiles/Output/"
29
def WriteTextToFile(outputFile, text):
    """Write `text` to the file at `outputFile`, replacing any existing content.

    The file is opened in text write mode and is closed when the block
    exits, including when the write raises.
    """
    with open(outputFile, "w") as handle:
        handle.write(text)
37
38def main():
39    # The first step in every application using PDFNet is to initialize the 
40    # library. The library is usually initialized only once, but calling 
41    # Initialize() multiple times is also fine.
42    PDFNet.Initialize(LicenseKey)
43    
44    PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")
45
46    #-----------------------------------------------------------------------------------
47    # The following sample illustrates how to extract tables from PDF documents.
48    #-----------------------------------------------------------------------------------
49
50    # Test if the add-on is installed
51    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_Tabular):
52        print("")
53        print("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.")
54        print("-----------------------------------------------------------------------------")
55        print("The Data Extraction suite is an optional add-on, available for download")
56        print("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
57        print("downloaded this module, ensure that the SDK is able to find the required files")
58        print("using the PDFNet.AddResourceSearchPath() function.")
59        print("")
60    else:
61        try:
62            # Extract tabular data as a JSON file
63            print("Extract tabular data as a JSON file")
64
65            outputFile = outputPath + "table.json"
66            DataExtractionModule.ExtractData(inputPath + "table.pdf", outputFile, DataExtractionModule.e_Tabular)
67
68            print("Result saved in " + outputFile)
69
70            #------------------------------------------------------
71            # Extract tabular data as a JSON string
72            print("Extract tabular data as a JSON string")
73
74            outputFile = outputPath + "financial.json"
75            json = DataExtractionModule.ExtractData(inputPath + "financial.pdf", DataExtractionModule.e_Tabular)
76            WriteTextToFile(outputFile, json)
77
78            print("Result saved in " + outputFile)
79
80            #------------------------------------------------------
81            # Extract tabular data as an XLSX file
82            print("Extract tabular data as an XLSX file")
83
84            outputFile = outputPath + "table.xlsx"
85            DataExtractionModule.ExtractToXLSX(inputPath + "table.pdf", outputFile)
86
87            print("Result saved in " + outputFile)
88
89            #------------------------------------------------------
90            # Extract tabular data as an XLSX stream (also known as filter)
91            print("Extract tabular data as an XLSX stream")
92
93            outputFile = outputPath + "financial.xlsx"
94            options = DataExtractionOptions()
95            options.SetPages("1") # page 1
96            outputXlsxStream = MemoryFilter(0, False)
97            DataExtractionModule.ExtractToXLSX(inputPath + "financial.pdf", outputXlsxStream, options)
98            outputXlsxStream.SetAsInputFilter()
99            outputXlsxStream.WriteToFile(outputFile, False)
100
101            print("Result saved in " + outputFile)
102        except Exception as e:
103            print("Unable to extract tabular data, error: " + str(e))
104
105    #-----------------------------------------------------------------------------------
106    # The following sample illustrates how to extract document structure from PDF documents.
107    #-----------------------------------------------------------------------------------
108
109    # Test if the add-on is installed
110    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_DocStructure):
111        print("")
112        print("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
113        print("-----------------------------------------------------------------------------")
114        print("The Data Extraction suite is an optional add-on, available for download")
115        print("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
116        print("downloaded this module, ensure that the SDK is able to find the required files")
117        print("using the PDFNet.AddResourceSearchPath() function.")
118        print("")
119    else:
120        try:
121            # Extract document structure as a JSON file
122            print("Extract document structure as a JSON file")
123
124            outputFile = outputPath + "paragraphs_and_tables.json"
125            DataExtractionModule.ExtractData(inputPath + "paragraphs_and_tables.pdf", outputFile, DataExtractionModule.e_DocStructure)
126
127            print("Result saved in " + outputFile)
128
129            #------------------------------------------------------
130            # Extract document structure as a JSON string
131            print("Extract document structure as a JSON string")
132
133            outputFile = outputPath + "tagged.json"
134            json = DataExtractionModule.ExtractData(inputPath + "tagged.pdf", DataExtractionModule.e_DocStructure)
135            WriteTextToFile(outputFile, json)
136
137            print("Result saved in " + outputFile)
138        except Exception as e:
139            print("Unable to extract document structure data, error: " + str(e))
140
141    #-----------------------------------------------------------------------------------
142    # The following sample illustrates how to extract form fields from PDF documents.
143    #-----------------------------------------------------------------------------------
144
145    # Test if the add-on is installed
146    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_Form):
147        print("")
148        print("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.")
149        print("-----------------------------------------------------------------------------")
150        print("The Data Extraction suite is an optional add-on, available for download")
151        print("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
152        print("downloaded this module, ensure that the SDK is able to find the required files")
153        print("using the PDFNet.AddResourceSearchPath() function.")
154        print("")
155    else:
156        try:
157            # Extract form fields as a JSON file
158            print("Extract form fields as a JSON file")
159
160            outputFile = outputPath + "formfields-scanned.json"
161            DataExtractionModule.ExtractData(inputPath + "formfields-scanned.pdf", outputFile, DataExtractionModule.e_Form)
162
163            print("Result saved in " + outputFile)
164
165            #------------------------------------------------------
166            # Extract form fields as a JSON string
167            print("Extract form fields as a JSON string")
168
169            outputFile = outputPath + "formfields.json"
170            json = DataExtractionModule.ExtractData(inputPath + "formfields.pdf", DataExtractionModule.e_Form)
171            WriteTextToFile(outputFile, json)
172
173            print("Result saved in " + outputFile)
174
175            #-----------------------------------------------------------------------------------
176            # Detect and add form fields to a PDF document.
177            # PDF document already has form fields, and this sample will update to new found fields.
178            print("Extract form fields as a pdf file, update to new")
179
180            doc = PDFDoc(inputPath + "formfields-scanned-withfields.pdf")
181            
182            DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)
183            
184            outputFile = outputPath + "formfields-scanned-fields-new.pdf"
185            doc.Save(outputFile, SDFDoc.e_linearized)
186            doc.Close()
187            
188            print("Result saved in " + outputFile)
189
190            #-----------------------------------------------------------------------------------
191            # Detect and add form fields to a PDF document.
192            # PDF document already has form fields, and this sample will keep the original fields.
193            print("Extract form fields as a pdf file, keep original")
194
195            doc = PDFDoc(inputPath + "formfields-scanned-withfields.pdf")
196            
197            options = DataExtractionOptions()
198            options.SetOverlappingFormFieldBehavior("KeepOld")
199            DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)
200            
201            outputFile = outputPath + "formfields-scanned-fields-old.pdf"
202            doc.Save(outputFile, SDFDoc.e_linearized)
203            doc.Close()
204            
205            print("Result saved in " + outputFile)
206
207        except Exception as e:
208            print("Unable to extract form fields data, error: " + str(e))
209
210    #---------------------------------------------------------------------------------------
211    # The following sample illustrates how to extract key-value pairs from PDF documents.
212    #---------------------------------------------------------------------------------------
213    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_GenericKeyValue):
214        print()
215        print("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.")
216        print("---------------------------------------------------------------")
217        print("The Data Extraction suite is an optional add-on, available for download")
218        print("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this")
219        print("module, ensure that the SDK is able to find the required files")
220        print("using the PDFNet.AddResourceSearchPath() function.")
221        print()
222    else:
223        try:
224            print("Extract key-value pairs from a PDF")
225            # Simple example: Extract Keys & Values as a JSON file
226            DataExtractionModule.ExtractData(inputPath + "newsletter.pdf", outputPath + "newsletter_key_val.json", DataExtractionModule.e_GenericKeyValue)
227            print("Result saved in " + outputPath + "newsletter_key_val.json")
228
229            # Example with customized options:
230            # Extract Keys & Values from pages 2-4, excluding ads
231            options = DataExtractionOptions()
232            options.SetPages("2-4")
233
234            p2_exclusion_zones = RectCollection()
235            # Exclude the ad on page 2
236            # These coordinates are in PDF user space, with the origin at the bottom left corner of the page
237            # Coordinates rotate with the page, if it has rotation applied.
238            p2_exclusion_zones.AddRect(Rect(166, 47, 562, 222))
239            options.AddExclusionZonesForPage(p2_exclusion_zones, 2)
240
241            p4_inclusion_zones = RectCollection()
242            p4_exclusion_zones = RectCollection()
243            # Only include the article text for page 4, exclude ads and headings
244            p4_inclusion_zones.AddRect(Rect(30, 432, 562, 684))
245            p4_exclusion_zones.AddRect(Rect(30, 657, 295, 684))
246            options.AddInclusionZonesForPage(p4_inclusion_zones, 4)
247            options.AddExclusionZonesForPage(p4_exclusion_zones, 4)
248            print("Extract Key-Value pairs from specific pages and zones as a JSON file")
249            DataExtractionModule.ExtractData(inputPath + "newsletter.pdf", outputPath + "newsletter_key_val_with_zones.json", DataExtractionModule.e_GenericKeyValue, options)
250            print("Result saved in " + outputPath + "newsletter_key_val_with_zones.json")
251        except Exception as e:
252                print("Unable to extract key-value data, error: " + str(e))
253
254
255    #-----------------------------------------------------------------------------------
256    # The following sample illustrates how to extract document classes from PDF documents.
257    #-----------------------------------------------------------------------------------
258
259    # Test if the add-on is installed
260    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_DocClassification):
261        print("")
262        print("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.")
263        print("-----------------------------------------------------------------------------")
264        print("The Data Extraction suite is an optional add-on, available for download")
265        print("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
266        print("downloaded this module, ensure that the SDK is able to find the required files")
267        print("using the PDFNet.AddResourceSearchPath() function.")
268        print("")
269    else:
270        try:
271            # Simple example: classify pages as a JSON file
272            print("Classify pages as a JSON file")
273
274            outputFile = outputPath + "Invoice_Classified.json"
275            DataExtractionModule.ExtractData(inputPath + "Invoice.pdf", outputFile, DataExtractionModule.e_DocClassification)
276
277            print("Result saved in " + outputFile)
278
279            #------------------------------------------------------
280            # Classify pages as a JSON string
281            print("Classify pages as a JSON string")
282
283            outputFile = outputPath + "Scientific_Publication_Classified.json"
284            json = DataExtractionModule.ExtractData(inputPath + "Scientific_Publication.pdf", DataExtractionModule.e_DocClassification)
285            WriteTextToFile(outputFile, json)
286
287            print("Result saved in " + outputFile)
288
289            #------------------------------------------------------
290            # Example with customized options:
291            print("Classify pages with customized options")
292
293            options = DataExtractionOptions()
294            # Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
295            options.SetMinimumConfidenceThreshold(0.7)
296            outputFile = outputPath + "Email_Classified.json"
297            DataExtractionModule.ExtractData(inputPath + "Email.pdf", outputFile, DataExtractionModule.e_DocClassification, options)
298
299            print("Result saved in " + outputFile)
300
301        except Exception as e:
302            print("Unable to extract document structure data, error: " + str(e))
303
304    PDFNet.Terminate()
305    print("Done.")
306    
# Run the sample only when this file is executed as a script (not on import).
if __name__ == "__main__":
    main()
309

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
12#---------------------------------------------------------------------------------------
13# The Data Extraction suite is an optional PDFNet add-on collection that can be used to
14# extract various types of data from PDF documents.
15#
16# The Apryse SDK Data Extraction suite can be downloaded from
17# https://docs.apryse.com/core/guides/info/modules#data-extraction-module
18#
19# Please contact us if you have any questions.
20#---------------------------------------------------------------------------------------
21
22# Relative path to the folder containing the test files.
23$inputPath = "../../TestFiles/"
24$outputPath = "../../TestFiles/Output/"
25	
# Prints the notice shown when a Data Extraction engine is not installed.
#
# module_label  - product/module name inserted into the headline.
# download_hint - the line naming where the add-on can be downloaded; the
#                 document classification sample passes a different URL.
def report_missing_module(module_label, download_hint = "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
	puts ""
	puts "Unable to run Data Extraction: " + module_label + " module not available."
	puts "-----------------------------------------------------------------------------"
	puts "The Data Extraction suite is an optional add-on, available for download"
	puts download_hint
	puts "downloaded this module, ensure that the SDK is able to find the required files"
	puts "using the PDFNet.AddResourceSearchPath() function."
	puts ""
end

# Opens input_file, runs form field detection on it (with options when given),
# saves the result to output_file and always closes the document afterwards.
def detect_and_save_form_fields(input_file, output_file, options = nil)
	doc = PDFDoc.new(input_file)
	begin
		if options.nil?
			DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)
		else
			DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)
		end
		doc.Save(output_file, SDFDoc::E_linearized)
	ensure
		# Close the document even when detection or saving raised.
		doc.Close
	end
end

#-----------------------------------------------------------------------------------
# The following sample illustrates how to extract tables from PDF documents.
#-----------------------------------------------------------------------------------
def extract_tabular_data()
	# Test if the add-on is installed
	unless DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_Tabular)
		report_missing_module("Apryse SDK Tabular Data")
		return
	end

	begin
		# Extract tabular data as a JSON file
		puts "Extract tabular data as a JSON file"

		outputFile = $outputPath + "table.json"
		DataExtractionModule.ExtractData($inputPath + "table.pdf", outputFile, DataExtractionModule::E_Tabular)

		puts "Result saved in " + outputFile

		#------------------------------------------------------
		# Extract tabular data as a JSON string
		puts "Extract tabular data as a JSON string"

		outputFile = $outputPath + "financial.json"
		json = DataExtractionModule.ExtractData($inputPath + "financial.pdf", DataExtractionModule::E_Tabular)
		File.write(outputFile, json)

		puts "Result saved in " + outputFile

		#------------------------------------------------------
		# Extract tabular data as an XLSX file
		puts "Extract tabular data as an XLSX file"

		outputFile = $outputPath + "table.xlsx"
		DataExtractionModule.ExtractToXLSX($inputPath + "table.pdf", outputFile)

		puts "Result saved in " + outputFile

		#------------------------------------------------------
		# Extract tabular data as an XLSX stream (also known as filter)
		puts "Extract tabular data as an XLSX stream"

		outputFile = $outputPath + "financial.xlsx"
		outputXlsxStream = MemoryFilter.new(0, false)
		options = DataExtractionOptions.new()
		options.SetPages("1") # page 1
		DataExtractionModule.ExtractToXLSX($inputPath + "financial.pdf", outputXlsxStream, options)
		# Switch the filter to input mode before writing its contents to disk.
		outputXlsxStream.SetAsInputFilter()
		outputXlsxStream.WriteToFile(outputFile, false)

		puts "Result saved in " + outputFile
	rescue => error
		puts "Unable to extract tabular data, error: " + error.message
	end
end

#-----------------------------------------------------------------------------------
# The following sample illustrates how to extract document structure from PDF documents.
#-----------------------------------------------------------------------------------
def extract_document_structure()
	# Test if the add-on is installed
	unless DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_DocStructure)
		report_missing_module("PDFTron SDK Structured Output")
		return
	end

	begin
		# Extract document structure as a JSON file
		puts "Extract document structure as a JSON file"

		outputFile = $outputPath + "paragraphs_and_tables.json"
		DataExtractionModule.ExtractData($inputPath + "paragraphs_and_tables.pdf", outputFile, DataExtractionModule::E_DocStructure)

		puts "Result saved in " + outputFile

		#------------------------------------------------------
		# Extract document structure as a JSON string
		puts "Extract document structure as a JSON string"

		outputFile = $outputPath + "tagged.json"
		json = DataExtractionModule.ExtractData($inputPath + "tagged.pdf", DataExtractionModule::E_DocStructure)
		File.write(outputFile, json)

		puts "Result saved in " + outputFile
	rescue => error
		puts "Unable to extract document structure data, error: " + error.message
	end
end

#-----------------------------------------------------------------------------------
# The following sample illustrates how to extract form fields from PDF documents.
#-----------------------------------------------------------------------------------
def extract_form_fields()
	# Test if the add-on is installed
	unless DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_Form)
		report_missing_module("PDFTron SDK AIFormFieldExtractor")
		return
	end

	begin
		# Extract form fields as a JSON file
		puts "Extract form fields as a JSON file"

		outputFile = $outputPath + "formfields-scanned.json"
		DataExtractionModule.ExtractData($inputPath + "formfields-scanned.pdf", outputFile, DataExtractionModule::E_Form)

		puts "Result saved in " + outputFile

		#------------------------------------------------------
		# Extract form fields as a JSON string
		puts "Extract form fields as a JSON string"

		outputFile = $outputPath + "formfields.json"
		json = DataExtractionModule.ExtractData($inputPath + "formfields.pdf", DataExtractionModule::E_Form)
		File.write(outputFile, json)

		puts "Result saved in " + outputFile

		#-----------------------------------------------------------------------------------
		# Detect and add form fields to a PDF document.
		# PDF document already has form fields, and this sample will update to the new fields.
		puts "Extract form fields as a pdf file, update to new"

		outputFile = $outputPath + "formfields-scanned-fields-new.pdf"
		detect_and_save_form_fields($inputPath + "formfields-scanned-withfields.pdf", outputFile)

		puts "Result saved in " + outputFile

		#-----------------------------------------------------------------------------------
		# Detect and add form fields to a PDF document.
		# PDF document already has form fields, and this sample will keep the original fields.
		puts "Extract form fields as a pdf file, keep original"

		outputFile = $outputPath + "formfields-scanned-fields-old.pdf"
		options = DataExtractionOptions.new()
		options.SetOverlappingFormFieldBehavior("KeepOld")
		detect_and_save_form_fields($inputPath + "formfields-scanned-withfields.pdf", outputFile, options)

		puts "Result saved in " + outputFile
	rescue => error
		puts "Unable to extract form fields data, error: " + error.message
	end
end

#-----------------------------------------------------------------------------------
# The following sample illustrates how to extract key-value pairs from PDF documents.
#-----------------------------------------------------------------------------------
def extract_key_values()
	# Test if the add-on is installed
	unless DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_GenericKeyValue)
		report_missing_module("Apryse SDK AIPageObjectExtractor")
		return
	end

	begin
		puts "Extract key-value pairs from a PDF"
		# Simple example: Extract Keys & Values as a JSON file
		outputFile = $outputPath + "newsletter_key_val.json"
		DataExtractionModule.ExtractData($inputPath + "newsletter.pdf", outputFile, DataExtractionModule::E_GenericKeyValue)
		puts "Result saved in " + outputFile

		# Example with customized options:
		# Extract Keys & Values from pages 2-4, excluding ads
		options = DataExtractionOptions.new()
		options.SetPages("2-4")

		p2_exclusion_zones = RectCollection.new()
		# Exclude the ad on page 2
		# These coordinates are in PDF user space, with the origin at the bottom left corner of the page
		# Coordinates rotate with the page, if it has rotation applied.
		p2_exclusion_zones.AddRect(Rect.new(166, 47, 562, 222))
		options.AddExclusionZonesForPage(p2_exclusion_zones, 2)

		p4_inclusion_zones = RectCollection.new()
		p4_exclusion_zones = RectCollection.new()
		# Only include the article text for page 4, exclude ads and headings
		p4_inclusion_zones.AddRect(Rect.new(30, 432, 562, 684))
		p4_exclusion_zones.AddRect(Rect.new(30, 657, 295, 684))
		options.AddInclusionZonesForPage(p4_inclusion_zones, 4)
		options.AddExclusionZonesForPage(p4_exclusion_zones, 4)

		puts "Extract Key-Value pairs from specific pages and zones as a JSON file"
		outputFile = $outputPath + "newsletter_key_val_with_zones.json"
		DataExtractionModule.ExtractData($inputPath + "newsletter.pdf", outputFile, DataExtractionModule::E_GenericKeyValue, options)
		puts "Result saved in " + outputFile
	rescue => error
		puts "Unable to extract key-value data, error: " + error.message
	end
end

#-----------------------------------------------------------------------------------
# The following sample illustrates how to extract document classes from PDF documents.
#-----------------------------------------------------------------------------------
def classify_documents()
	# Test if the add-on is installed
	unless DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_DocClassification)
		report_missing_module("PDFTron SDK AIPageObjectExtractor", "at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
		return
	end

	begin
		# Simple example: classify pages as a JSON file
		puts "Classify pages as a JSON file"

		outputFile = $outputPath + "Invoice_Classified.json"
		DataExtractionModule.ExtractData($inputPath + "Invoice.pdf", outputFile, DataExtractionModule::E_DocClassification)

		puts "Result saved in " + outputFile

		#------------------------------------------------------
		# Classify pages as a JSON string
		puts "Classify pages as a JSON string"

		outputFile = $outputPath + "Scientific_Publication_Classified.json"
		json = DataExtractionModule.ExtractData($inputPath + "Scientific_Publication.pdf", DataExtractionModule::E_DocClassification)
		File.write(outputFile, json)

		puts "Result saved in " + outputFile

		#------------------------------------------------------
		# Example with customized options:
		puts "Classify pages with customized options"

		options = DataExtractionOptions.new()
		# Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
		options.SetMinimumConfidenceThreshold(0.7)
		outputFile = $outputPath + "Email_Classified.json"
		DataExtractionModule.ExtractData($inputPath + "Email.pdf", outputFile, DataExtractionModule::E_DocClassification, options)

		puts "Result saved in " + outputFile
	rescue => error
		puts "Unable to extract document classification data, error: " + error.message
	end
end

# Entry point: initializes PDFNet, runs each Data Extraction sample in turn and
# terminates the library afterwards.
def main()
	# The first step in every application using PDFNet is to initialize the
	# library. The library is usually initialized only once, but calling
	# Initialize() multiple times is also fine.
	PDFNet.Initialize(PDFTronLicense.Key)

	begin
		PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")

		extract_tabular_data()
		extract_document_structure()
		extract_form_fields()
		extract_key_values()
		classify_documents()
	ensure
		# Terminate the library even if a sample raised an unexpected error.
		PDFNet.Terminate
	end

	puts "Done."
end

main()
305

1'
2' Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3'
4
5Imports pdftron
6Imports pdftron.Common
7Imports pdftron.PDF
8Imports pdftron.Filters
9
10' The Data Extraction suite is an optional PDFNet add-on collection that can be used to
11' extract various types of data from PDF documents.
12' The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
13
14Module DataExtractionTestVB
15	Dim pdfNetLoader As PDFNetLoader
16	Sub New()
17		pdfNetLoader = pdftron.PDFNetLoader.Instance()
18	End Sub
19
20	' Relative path to the folder containing test files.
21	Dim input_path As String = "../../../../TestFiles/"
22	Dim output_path As String = "../../../../TestFiles/Output/"
23
24	Sub Main()
25		PDFNet.Initialize(PDFTronLicense.Key)
26		PDFNet.AddResourceSearchPath("../../../../../Lib/")
27
28		TestTabularData()
29		TestDocumentStructure()
30		TestFormFields()
31		TestGenericKeyValue()
32		TestDocClassifier()
33
34		PDFNet.Terminate()
35	End Sub
36
37
38	' The following sample illustrates how to extract tables from PDF documents.
39	Sub TestTabularData()
40		' Test if the add-on is installed
41		If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular) Then
42			Console.WriteLine()
43			Console.WriteLine("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.")
44			Console.WriteLine("---------------------------------------------------------------")
45			Console.WriteLine("The Data Extraction suite is an optional add-on, available for download")
46			Console.WriteLine("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this")
47			Console.WriteLine("module, ensure that the SDK is able to find the required files")
48			Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.")
49			Console.WriteLine()
50			Return
51		End If
52
53		Try
54			' Extract tabular data as a JSON file
55			DataExtractionModule.ExtractData(input_path & "table.pdf", output_path & "table.json", DataExtractionModule.DataExtractionEngine.e_tabular)
56
57			' Extract tabular data as a JSON string
58			Dim json As String = DataExtractionModule.ExtractData(input_path & "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular)
59			System.IO.File.WriteAllText(output_path & "financial.json", json)
60
61			' Extract tabular data as an XLSX file
62			DataExtractionModule.ExtractToXLSX(input_path & "table.pdf", output_path & "table.xlsx")
63
64			' Extract tabular data as an XLSX stream (also known as filter)
65			Dim output_xlsx_stream As MemoryFilter = New MemoryFilter(0, False)
66			DataExtractionModule.ExtractToXLSX(input_path & "financial.pdf", output_xlsx_stream)
67			output_xlsx_stream.SetAsInputFilter()
68			output_xlsx_stream.WriteToFile(output_path & "financial.xlsx", False)
69
70		Catch e As PDFNetException
71			Console.WriteLine(e.Message)
72		End Try
73	End Sub
74
75
76	' The following sample illustrates how to extract document structure from PDF documents.
77	Sub TestDocumentStructure()
78		' Test if the add-on is installed
79		If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure) Then
80			Console.WriteLine()
81			Console.WriteLine("Unable to run Data Extraction: Apryse SDK Structured Output module not available.")
82			Console.WriteLine("---------------------------------------------------------------")
83			Console.WriteLine("The Data Extraction suite is an optional add-on, available for download")
84			Console.WriteLine("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this")
85			Console.WriteLine("module, ensure that the SDK is able to find the required files")
86			Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.")
87			Console.WriteLine()
88			Return
89		End If
90
91		Try
92			' Extract document structure as a JSON file
93			DataExtractionModule.ExtractData(input_path & "paragraphs_and_tables.pdf", output_path & "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure)
94
95			' Extract document structure as a JSON string
96			Dim json As String = DataExtractionModule.ExtractData(input_path & "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure)
97			System.IO.File.WriteAllText(output_path & "tagged.json", json)
98
99		Catch e As PDFNetException
100			Console.WriteLine(e.Message)
101		End Try
102	End Sub
103
104
105	' Sample: extracts form fields from PDF documents as JSON, then detects form fields and adds them to a PDF.
106	Sub TestFormFields()
107		Dim formEngine As DataExtractionModule.DataExtractionEngine = DataExtractionModule.DataExtractionEngine.e_form
108		' Nothing below can run without the add-on: print the download guidance and leave
109		If Not DataExtractionModule.IsModuleAvailable(formEngine) Then
110			Dim notice() As String = New String() { _
111				"", _
112				"Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.", _
113				"---------------------------------------------------------------", _
114				"The Data Extraction suite is an optional add-on, available for download", _
115				"at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this", _
116				"module, ensure that the SDK is able to find the required files", _
117				"using the PDFNet.AddResourceSearchPath() function.", _
118				""}
119			For Each noticeLine As String In notice
120				Console.WriteLine(noticeLine)
121			Next
122			Return
123		End If
124
125		Try
126			' Form fields written straight to a JSON file
127			DataExtractionModule.ExtractData(input_path & "formfields-scanned.pdf", output_path & "formfields-scanned.json", formEngine)
128			' Form fields returned as a JSON string, which the sample saves itself
129			Dim formJson As String = DataExtractionModule.ExtractData(input_path & "formfields.pdf", formEngine)
130			System.IO.File.WriteAllText(output_path & "formfields.json", formJson)
131
132			' Field detection on a PDF that already has form fields; without options the sample updates to the newly found fields
133			Using updatedDoc As New PDFDoc(input_path & "formfields-scanned-withfields.pdf")
134				DataExtractionModule.DetectAndAddFormFieldsToPDF(updatedDoc)
135				updatedDoc.Save(output_path & "formfields-scanned-fields-new.pdf", SDF.SDFDoc.SaveOptions.e_linearized)
136			End Using
137
138			' Same input once more, this time with "KeepOld" so that the original fields are kept
139			Using preservedDoc As New PDFDoc(input_path & "formfields-scanned-withfields.pdf")
140				Dim keepOldOptions As New DataExtractionOptions()
141				keepOldOptions.SetOverlappingFormFieldBehavior("KeepOld")
142				DataExtractionModule.DetectAndAddFormFieldsToPDF(preservedDoc, keepOldOptions)
143				preservedDoc.Save(output_path & "formfields-scanned-fields-old.pdf", SDF.SDFDoc.SaveOptions.e_linearized)
144			End Using
145		Catch ex As PDFNetException ' only SDK errors are handled here; they are printed, not rethrown
146			Console.WriteLine(ex.Message)
147		End Try
148	End Sub
149
150	' The following sample illustrates how to extract key-value pairs from PDF documents, first with default and then with customized options.
151	Sub TestGenericKeyValue()
152		' Test if the add-on is installed; if not, print the download guidance and skip the sample
153		If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_generic_key_value) Then
154			Console.WriteLine()
155			Console.WriteLine("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.")
156			Console.WriteLine("---------------------------------------------------------------")
157			Console.WriteLine("The Data Extraction suite is an optional add-on, available for download")
158			Console.WriteLine("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this")
159			Console.WriteLine("module, ensure that the SDK is able to find the required files")
160			Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.")
161			Console.WriteLine()
162			Return
163		End If
164
165		Try
166			' Simple example: Extract Keys & Values as a JSON file
167			DataExtractionModule.ExtractData(input_path & "newsletter.pdf", output_path & "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value)
168			' Example with customized options: Extract Keys & Values from pages 2-4, excluding ads
169			Dim options As New DataExtractionOptions()
170			options.SetPages("2-4")
171			' Exclude the ad on page 2. These coordinates are in PDF user space, with the origin at the
172			' bottom left corner of the page. Coordinates rotate with the page, if it has rotation applied.
173			Dim p2ExclusionZones As New RectCollection()
174			p2ExclusionZones.AddRect(166, 47, 562, 222)
175			options.AddExclusionZonesForPage(p2ExclusionZones, 2)
176			' Only include the article text for page 4, exclude ads and headings
177			Dim p4InclusionZones As New RectCollection()
178			Dim p4ExclusionZones As New RectCollection()
179			p4InclusionZones.AddRect(30, 432, 562, 684)
180			p4ExclusionZones.AddRect(30, 657, 295, 684)
181			options.AddInclusionZonesForPage(p4InclusionZones, 4)
182			options.AddExclusionZonesForPage(p4ExclusionZones, 4)
183			DataExtractionModule.ExtractData(input_path & "newsletter.pdf", output_path & "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options)
184		Catch e As PDFNetException ' SDK errors are printed to the console, not rethrown; other exception types still propagate
185			Console.WriteLine(e.Message)
186		End Try
187	End Sub
188
189	' The following sample illustrates how to extract document classes from PDF documents (file output, string output, and a confidence threshold).
190	Sub TestDocClassifier()
191		' Test if the add-on is installed; if not, print the download guidance and skip the sample
192		If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_classification) Then
193			Console.WriteLine()
194			Console.WriteLine("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.")
195			Console.WriteLine("---------------------------------------------------------------")
196			Console.WriteLine("The Data Extraction suite is an optional add-on, available for download")
197			Console.WriteLine("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this")
198			Console.WriteLine("module, ensure that the SDK is able to find the required files")
199			Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.")
200			Console.WriteLine()
201			Return
202		End If
203
204		Try
205			' Simple example: classify pages as a JSON file
206			DataExtractionModule.ExtractData(input_path & "Invoice.pdf", output_path & "Invoice_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification)
207
208			' Classify pages as a JSON string; the sample writes the string to disk itself
209			Dim json As String = DataExtractionModule.ExtractData(input_path & "Scientific_Publication.pdf", DataExtractionModule.DataExtractionEngine.e_doc_classification)
210			System.IO.File.WriteAllText(output_path & "Scientific_Publication_Classified.json", json)
211
212			' Example with customized options:
213			Dim options As New DataExtractionOptions()
214			' Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
215			options.SetMinimumConfidenceThreshold(0.7)
216			DataExtractionModule.ExtractData(input_path & "Email.pdf", output_path & "Email_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification, options)
217
218		Catch e As PDFNetException ' SDK errors are printed to the console, not rethrown; other exception types still propagate
219			Console.WriteLine(e.Message)
220		End Try
221	End Sub
222
223End Module
224

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales

Product:

Smart Data Extraction - Python Sample Code

Implementation steps