Smart Data Extraction

This sample code shows how to use the Apryse Data Extraction module to extract tabular data, document structure, and form fields from PDF documents. Sample code is provided in Python, C++, C# (.NET), Java, Node.js (JavaScript), PHP, Ruby, and VB.

To run this sample, you will need to:

Get started with Server SDK in your language/framework
Download the Data Extraction Module

Learn more about our Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6using System;
7
8using pdftron;
9using pdftron.Common;
10using pdftron.PDF;
11using pdftron.SDF;
12using pdftron.Filters;
13
namespace DataExtractionTestCS
{
	/// <summary>
	/// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
	/// extract various types of data from PDF documents.
	///
	/// The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
	/// </summary>
	class Class1
	{
		// Obtained once when the class is initialized; not referenced anywhere else in this sample.
		// NOTE(review): presumably this makes sure the native PDFNet library is loaded - confirm.
		private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();

		// An explicit static constructor guarantees the static field initializers above/below
		// run before the first member of this class is used.
		static Class1() { }

		// Relative path to the folder containing test files.
		static string input_path = "../../../../TestFiles/";
		static string output_path = "../../../../TestFiles/Output/";


		/// <summary>
		/// Illustrates how to extract tables from PDF documents: as a JSON file, as a JSON
		/// string, as an XLSX file and as an XLSX stream. Prints a notice and returns without
		/// extracting anything when the tabular engine is not available. A
		/// <see cref="PDFNetException"/> raised during extraction is reported on the console.
		/// </summary>
		static void TestTabularData()
		{
			// Test if the add-on is installed
			if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular))
			{
				Console.WriteLine();
				Console.WriteLine("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.");
				Console.WriteLine("---------------------------------------------------------------");
				Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
				Console.WriteLine("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module .  If you have already downloaded this");
				Console.WriteLine("module, ensure that the SDK is able to find the required files");
				Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
				Console.WriteLine();
				return;
			}

			try
			{
				// Extract tabular data as a JSON file
				DataExtractionModule.ExtractData(input_path + "table.pdf", output_path + "table.json", DataExtractionModule.DataExtractionEngine.e_tabular);

				// Extract tabular data as a JSON string
				string json = DataExtractionModule.ExtractData(input_path + "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular);
				System.IO.File.WriteAllText(output_path + "financial.json", json);

				// Extract tabular data as an XLSX file
				DataExtractionModule.ExtractToXLSX(input_path + "table.pdf", output_path + "table.xlsx");

				// Extract tabular data as an XLSX stream (also known as filter).
				// The filter is disposed deterministically once its content has been written out.
				using (MemoryFilter output_xlsx_stream = new MemoryFilter(0, false))
				{
					DataExtractionModule.ExtractToXLSX(input_path + "financial.pdf", output_xlsx_stream);

					// Switch the filter from output to input mode so its content can be read back.
					output_xlsx_stream.SetAsInputFilter();
					output_xlsx_stream.WriteToFile(output_path + "financial.xlsx", false);
				}
			}
			catch (PDFNetException e)
			{
				Console.WriteLine(e.Message);
			}
		}


		/// <summary>
		/// Illustrates how to extract document structure from PDF documents, as a JSON file
		/// and as a JSON string. Prints a notice and returns without extracting anything when
		/// the document structure engine is not available. A <see cref="PDFNetException"/>
		/// raised during extraction is reported on the console.
		/// </summary>
		static void TestDocumentStructure()
		{
			// Test if the add-on is installed
			if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure))
			{
				Console.WriteLine();
				Console.WriteLine("Unable to run Data Extraction: Apryse SDK Structured Output module not available.");
				Console.WriteLine("---------------------------------------------------------------");
				Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
				Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
				Console.WriteLine("module, ensure that the SDK is able to find the required files");
				Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
				Console.WriteLine();
				return;
			}

			try
			{
				// Extract document structure as a JSON file
				DataExtractionModule.ExtractData(input_path + "paragraphs_and_tables.pdf", output_path + "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure);

				// Extract document structure as a JSON string
				string json = DataExtractionModule.ExtractData(input_path + "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure);
				System.IO.File.WriteAllText(output_path + "tagged.json", json);
			}
			catch (PDFNetException e)
			{
				Console.WriteLine(e.Message);
			}
		}


		/// <summary>
		/// Illustrates how to extract form fields from PDF documents (as a JSON file and as a
		/// JSON string) and how to detect form fields and add them to a PDF document, either
		/// with the default overlap handling or keeping the fields the document already has.
		/// Prints a notice and returns without extracting anything when the form engine is not
		/// available. A <see cref="PDFNetException"/> raised during extraction is reported on
		/// the console.
		/// </summary>
		static void TestFormFields()
		{
			// Test if the add-on is installed
			if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
			{
				Console.WriteLine();
				Console.WriteLine("Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.");
				Console.WriteLine("---------------------------------------------------------------");
				Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
				Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
				Console.WriteLine("module, ensure that the SDK is able to find the required files");
				Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
				Console.WriteLine();
				return;
			}

			try
			{
				// Extract form fields as a JSON file
				DataExtractionModule.ExtractData(input_path + "formfields-scanned.pdf", output_path + "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form);

				// Extract form fields as a JSON string
				string json = DataExtractionModule.ExtractData(input_path + "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form);
				System.IO.File.WriteAllText(output_path + "formfields.json", json);

				// Detect and add form fields to a PDF document.
				// PDF document already has form fields, and this sample will update to new found fields.
				using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
				{
					DataExtractionModule.DetectAndAddFormFieldsToPDF(doc);
					doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDFDoc.SaveOptions.e_linearized);
				}

				// Detect and add form fields to a PDF document.
				// PDF document already has form fields, and this sample will keep the original fields.
				using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
				{
					DataExtractionOptions options = new DataExtractionOptions();
					options.SetOverlappingFormFieldBehavior("KeepOld");

					DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options);
					doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDFDoc.SaveOptions.e_linearized);
				}
			}
			catch (PDFNetException e)
			{
				Console.WriteLine(e.Message);
			}
		}

		/// <summary>
		/// Illustrates how to extract key-value pairs from PDF documents, first with default
		/// settings and then restricted to a page range with per-page inclusion and exclusion
		/// zones. Prints a notice and returns without extracting anything when the generic
		/// key-value engine is not available. A <see cref="PDFNetException"/> raised during
		/// extraction is reported on the console.
		/// </summary>
		static void TestGenericKeyValue()
		{
			// Test if the add-on is installed
			if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_generic_key_value))
			{
				Console.WriteLine();
				Console.WriteLine("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.");
				Console.WriteLine("---------------------------------------------------------------");
				Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
				Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
				Console.WriteLine("module, ensure that the SDK is able to find the required files");
				Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
				Console.WriteLine();
				return;
			}

			try
			{
				// Simple example: Extract Keys & Values as a JSON file
				DataExtractionModule.ExtractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value);

				// Example with customized options:
				// Extract Keys & Values from pages 2-4, excluding ads
				DataExtractionOptions options = new DataExtractionOptions();
				options.SetPages("2-4");

				RectCollection p2ExclusionZones = new RectCollection();
				// Exclude the ad on page 2
				// These coordinates are in PDF user space, with the origin at the bottom left corner of the page
				// Coordinates rotate with the page, if it has rotation applied.
				p2ExclusionZones.AddRect(166, 47, 562, 222);
				options.AddExclusionZonesForPage(p2ExclusionZones, 2);

				RectCollection p4InclusionZones = new RectCollection();
				RectCollection p4ExclusionZones = new RectCollection();
				// Only include the article text for page 4, exclude ads and headings
				p4InclusionZones.AddRect(30, 432, 562, 684);
				p4ExclusionZones.AddRect(30, 657, 295, 684);
				options.AddInclusionZonesForPage(p4InclusionZones, 4);
				options.AddExclusionZonesForPage(p4ExclusionZones, 4);

				DataExtractionModule.ExtractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options);
			}
			catch (PDFNetException e)
			{
				Console.WriteLine(e.Message);
			}
		}



		/// <summary>
		/// The main entry point for the application. Initializes PDFNet, runs each data
		/// extraction sample in turn and terminates PDFNet afterwards.
		/// </summary>
		/// <param name="args">Command-line arguments; not used by this sample.</param>
		static void Main(string[] args)
		{
			// The first step in every application using PDFNet is to initialize the 
			// library and set the path to common PDF resources. The library is usually 
			// initialized only once, but calling Initialize() multiple times is also fine.
			PDFNet.Initialize(PDFTronLicense.Key);

			try
			{
				PDFNet.AddResourceSearchPath("../../../../../Lib/");

				TestTabularData();
				TestDocumentStructure();
				TestFormFields();
				TestGenericKeyValue();
			}
			finally
			{
				// The samples only catch PDFNetException; anything else (for example an I/O
				// error from File.WriteAllText) still propagates, but the library is now
				// terminated on the way out.
				PDFNet.Terminate();
			}
		}
	}
}
239

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/DataExtractionModule.h>
7#include <PDF/PDFNet.h>
8#include <PDF/PDFDoc.h>
9#include <PDF/Convert.h>
10#include <Filters/MemoryFilter.h>
11#include <string>
12#include <iostream>
13#include <fstream>
14#include "../../LicenseKey/CPP/LicenseKey.h"
15
16using namespace pdftron;
17using namespace PDF;
18using namespace Filters;
19using namespace std;
20
21//---------------------------------------------------------------------------------------
22// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
23// extract various types of data from PDF documents.
24//
25// The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
26//---------------------------------------------------------------------------------------
27
28void WriteTextToFile(const std::string& filename, const UString& text)
29{
30	ofstream out_file(filename.c_str(), ofstream::binary);
31	string out_buf = text.ConvertToUtf8();
32	out_file.write(out_buf.c_str(), out_buf.size());
33	out_file.close();
34}
35
36
// Relative paths to the folders holding the sample input files and the generated output.
std::string input_path = "../../TestFiles/";
std::string output_path = "../../TestFiles/Output/";
39
40//---------------------------------------------------------------------------------------
41// The following sample illustrates how to extract tables from PDF documents.
42//---------------------------------------------------------------------------------------
43void TestTabularData()
44{
45	// Test if the add-on is installed
46	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular))
47	{
48		cout << endl;
49		cout << "Unable to run Data Extraction: Apryse SDK Tabular Data module not available." << endl;
50		cout << "---------------------------------------------------------------" << endl;
51		cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
52		cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
53		cout << "module, ensure that the SDK is able to find the required files" << endl;
54		cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
55		return;
56	}
57
58	// Extract tabular data as a JSON file
59	DataExtractionModule::ExtractData(input_path + UString("table.pdf"), output_path + UString("table.json"), DataExtractionModule::e_Tabular);
60
61	// Extract tabular data as a JSON string
62	UString json = DataExtractionModule::ExtractData(input_path + UString("financial.pdf"), DataExtractionModule::e_Tabular);
63	WriteTextToFile((output_path + "financial.json").c_str(), json);
64
65	// Extract tabular data as an XLSX file
66	DataExtractionModule::ExtractToXLSX(input_path + UString("table.pdf"), output_path + UString("table.xlsx"));
67
68	// Extract tabular data as an XLSX stream (also known as filter)
69	MemoryFilter output_xlsx_stream(0, false);
70	DataExtractionOptions options;
71	options.SetPages("1"); // extract page 1
72	DataExtractionModule::ExtractToXLSX(input_path + UString("financial.pdf"), output_xlsx_stream, &options);
73	output_xlsx_stream.SetAsInputFilter();
74	output_xlsx_stream.WriteToFile(output_path + UString("financial.xlsx"), false);
75}
76
77//---------------------------------------------------------------------------------------
78// The following sample illustrates how to extract document structure from PDF documents.
79//---------------------------------------------------------------------------------------
80void TestDocumentStructure()
81{
82	// Test if the add-on is installed
83	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure))
84	{
85		cout << endl;
86		cout << "Unable to run Data Extraction: Apryse SDK Structured Output module not available." << endl;
87		cout << "---------------------------------------------------------------" << endl;
88		cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
89		cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
90		cout << "module, ensure that the SDK is able to find the required files" << endl;
91		cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
92		return;
93	}
94
95	// Extract document structure as a JSON file
96	DataExtractionModule::ExtractData(input_path + UString("paragraphs_and_tables.pdf"), output_path + UString("paragraphs_and_tables.json"), DataExtractionModule::e_DocStructure);
97
98	// Extract document structure as a JSON string
99	UString json = DataExtractionModule::ExtractData(input_path + UString("tagged.pdf"), DataExtractionModule::e_DocStructure);
100	WriteTextToFile((output_path + "tagged.json").c_str(), json);
101}
102
103//---------------------------------------------------------------------------------------
104// The following sample illustrates how to extract form fields from PDF documents.
105//---------------------------------------------------------------------------------------
106void TestFormFields()
107{
108	// Test if the add-on is installed
109	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form))
110	{
111		cout << endl;
112		cout << "Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available." << endl;
113		cout << "---------------------------------------------------------------" << endl;
114		cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
115		cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
116		cout << "module, ensure that the SDK is able to find the required files" << endl;
117		cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
118		return;
119	}
120
121	// Extract form fields as a JSON file
122	DataExtractionModule::ExtractData(input_path + UString("formfields-scanned.pdf"), output_path + UString("formfields-scanned.json"), DataExtractionModule::e_Form);
123
124	// Extract form fields as a JSON string
125	UString json = DataExtractionModule::ExtractData(input_path + UString("formfields.pdf"), DataExtractionModule::e_Form);
126	WriteTextToFile((output_path + "formfields.json").c_str(), json);
127
128	//---------------------------------------------------------------------------------------
129	// Detect and add form fields to a PDF document.
130	// PDF document already has form fields, and this sample will update to new found fields.
131	//---------------------------------------------------------------------------------------
132	{
133		PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
134
135		DataExtractionModule::DetectAndAddFormFieldsToPDF(doc);
136
137		// Save the modfied pdf document
138		doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDF::SDFDoc::e_linearized, NULL);
139	}
140
141	//---------------------------------------------------------------------------------------
142	// Detect and add form fields to a PDF document.
143	// PDF document already has form fields, and this sample will keep the original fields.
144	//---------------------------------------------------------------------------------------
145	{
146		PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
147
148		// Setup DataExtractionOptions to keep old fields
149		DataExtractionOptions options;
150		options.SetOverlappingFormFieldBehavior("KeepOld");
151
152		DataExtractionModule::DetectAndAddFormFieldsToPDF(doc, &options);
153
154		// Save the modfied pdf document
155		doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDF::SDFDoc::e_linearized, NULL);
156	}
157}
158
159//---------------------------------------------------------------------------------------
160// The following sample illustrates how to extract key-value pairs from PDF documents.
161//---------------------------------------------------------------------------------------
162void TestGenericKeyValue() {
163
164	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_GenericKeyValue))
165	{
166		cout << endl;
167		cout << "Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available." << endl;
168		cout << "---------------------------------------------------------------" << endl;
169		cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
170		cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
171		cout << "module, ensure that the SDK is able to find the required files" << endl;
172		cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
173		return;
174	}
175
176	// Simple example: Extract Keys & Values as a JSON file
177	DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val.json"), DataExtractionModule::e_GenericKeyValue);
178
179	// Example with customized options:
180	// Extract Keys & Values from pages 2-4, excluding ads
181	DataExtractionOptions options;
182	options.SetPages("2-4");
183	RectCollection p2_exclusion_zones;
184	// Exclude the ad on page 2
185	// These coordinates are in PDF user space, with the origin at the bottom left corner of the page
186	// Coordinates rotate with the page, if it has rotation applied.
187	p2_exclusion_zones.AddRect(166, 47, 562, 222);
188	options.AddExclusionZonesForPage(p2_exclusion_zones, 2);
189
190	RectCollection p4_inclusion_zones, p4_exclusion_zones;
191	// Only include the article text for page 4, exclude ads and headings
192	p4_inclusion_zones.AddRect(30, 432, 562, 684);
193	p4_exclusion_zones.AddRect(30, 657, 295, 684);
194	options.AddInclusionZonesForPage(p4_inclusion_zones, 4);
195	options.AddExclusionZonesForPage(p4_exclusion_zones, 4);
196
197	DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val_with_zones.json"), DataExtractionModule::e_GenericKeyValue, &options);
198}
199
200int main(int argc, char* argv[])
201{
202	// The first step in every application using PDFNet is to initialize the 
203	// library and set the path to common PDF resources. The library is usually 
204	// initialized only once, but calling Initialize() multiple times is also fine.
205	PDFNet::Initialize(LicenseKey);
206
207	int ret = 0;
208
209	try
210	{
211		PDFNet::AddResourceSearchPath("../../../Lib/");
212
213		TestTabularData();
214		TestDocumentStructure();
215		TestFormFields();
216		TestGenericKeyValue();
217	}
218	catch (Common::Exception& e)
219	{
220		cout << e << endl;
221		ret = 1;
222	}
223	catch (...)
224	{
225		cout << "Unknown Exception" << endl;
226		ret = 1;
227	}
228
229	PDFNet::Terminate();
230
231	return ret;
232}
233

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8	"fmt"
9	"testing"
10	"os"
11	"flag"
12	. "github.com/pdftron/pdftron-go/v2"
13)
14
// Settings supplied on the command line; both default to the empty string.
var (
	licenseKey string // value of the -license flag
	modulePath string // value of the -modulePath flag
)

// init registers the -license and -modulePath command-line flags and binds them
// to the package-level variables above.
func init() {
	flag.StringVar(&licenseKey, "license", "", "License key for Apryse SDK")
	flag.StringVar(&modulePath, "modulePath", "", "Path for downloaded modules")
}
22
23//---------------------------------------------------------------------------------------
24// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
25// extract various types of data from PDF documents.
26//
27// The Apryse SDK Data Extraction suite can be downloaded from
28// https://docs.apryse.com/core/guides/info/modules
29//
30// Please contact us if you have any questions.
31//---------------------------------------------------------------------------------------
32
// Relative paths to the folder containing the test files and to the folder
// that receives the generated output.
var (
	inputPath  = "../TestFiles/"
	outputPath = "../TestFiles/Output/"
)
36
37//---------------------------------------------------------------------------------------
38
// catch is meant to be deferred by a function with a named error result. When
// that function panics, the panic is recovered and its value, formatted with
// %v, is stored in *err. When there is no panic, *err is left untouched.
func catch(err *error) {
	recovered := recover()
	if recovered == nil {
		return
	}
	*err = fmt.Errorf("%v", recovered)
}
44
45//---------------------------------------------------------------------------------------
46
47func WriteTextToFile(outputFile string, text string) {
48	f, err := os.Create(outputFile)
49	if err != nil {
50		fmt.Println(err)
51	}
52
53	defer f.Close()
54
55	_, err2 := f.WriteString(text)
56	if err2 != nil {
57		fmt.Println(err2)
58	}
59}
60
61//---------------------------------------------------------------------------------------
62// The following sample illustrates how to extract tables from PDF documents.
63//---------------------------------------------------------------------------------------
64
65func TabularDataTest() (err error) {
66	defer catch(&err)
67
68    PDFNetAddResourceSearchPath(modulePath)
69
70	// Test if the add-on is installed
71	if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_Tabular) {
72		fmt.Println("")
73		fmt.Println("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.")
74		fmt.Println("-----------------------------------------------------------------------------")
75		fmt.Println("The Data Extraction suite is an optional add-on, available for download")
76		fmt.Println("at https://docs.apryse.com/documentation/core/guides/info/modules. If you have already")
77		fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
78		fmt.Println("using the PDFNetAddResourceSearchPath() function.")
79		fmt.Println("")
80		return nil
81	}
82
83	// Extract tabular data as a JSON file
84	fmt.Println("Extract tabular data as a JSON file")
85
86	inputFile := inputPath + "table.pdf"
87	outputFile := outputPath + "table.json"
88	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_Tabular)
89
90	fmt.Println("Result saved in " + outputFile)
91
92	// Extract tabular data as a JSON string
93	fmt.Println("Extract tabular data as a JSON string")
94
95	inputFile = inputPath + "financial.pdf"
96	outputFile = outputPath + "financial.json"
97
98	json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_Tabular).(string)
99	WriteTextToFile(outputFile, json)
100
101	fmt.Println("Result saved in " + outputFile)
102
103	// Extract tabular data as an XLSX file
104	fmt.Println("Extract tabular data as an XLSX file")
105
106	inputFile = inputPath + "table.pdf"
107	outputFile = outputPath + "table.xlsx"
108	DataExtractionModuleExtractToXLSX(inputFile, outputFile)
109
110	fmt.Println("Result saved in " + outputFile)
111
112	// Extract tabular data as an XLSX stream (also known as filter)
113	fmt.Println("Extract tabular data as an XLSX stream")
114
115	inputFile = inputPath + "financial.pdf"
116	outputFile = outputPath + "financial.xlsx"
117	outputXlsxStream := NewMemoryFilter(0, false)
118	outputFilter := NewFilter(outputXlsxStream)
119	options := NewDataExtractionOptions()
120	options.SetPages("1"); // page 1
121	DataExtractionModuleExtractToXLSX(inputFile, outputFilter, options)
122	outputXlsxStream.SetAsInputFilter()
123	outputXlsxStream.WriteToFile(outputFile, false)
124
125	fmt.Println("Result saved in " + outputFile)
126
127	return nil
128}
129
130//---------------------------------------------------------------------------------------
131// The following sample illustrates how to extract document structure from PDF documents.
132//---------------------------------------------------------------------------------------
133
134func DocumentStructureTest() (err error) {
135	defer catch(&err)
136
137	// Test if the add-on is installed
138	if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_DocStructure) {
139		fmt.Println("")
140		fmt.Println("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
141		fmt.Println("-----------------------------------------------------------------------------")
142		fmt.Println("The Data Extraction suite is an optional add-on, available for download")
143		fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
144		fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
145		fmt.Println("using the PDFNetAddResourceSearchPath() function.")
146		fmt.Println("")
147		return nil
148	}
149
150	// Extract document structure as a JSON file
151	fmt.Println("Extract document structure as a JSON file")
152
153	inputFile := inputPath + "paragraphs_and_tables.pdf"
154	outputFile := outputPath + "paragraphs_and_tables.json"
155	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocStructure)
156
157	fmt.Println("Result saved in " + outputFile)
158
159	// Extract document structure as a JSON string
160	fmt.Println("Extract document structure as a JSON string")
161
162	inputFile = inputPath + "tagged.pdf"
163	outputFile = outputPath + "tagged.json"
164	json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_DocStructure).(string)
165	WriteTextToFile(outputFile, json)
166
167	fmt.Println("Result saved in " + outputFile)
168
169	return nil
170}
171
172//---------------------------------------------------------------------------------------
173// The following sample illustrates how to extract form fields from PDF documents.
174//---------------------------------------------------------------------------------------
175
176func FormFieldsTest() (err error) {
177	defer catch(&err)
178
179	// Test if the add-on is installed
180	if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_Form) {
181		fmt.Println("")
182		fmt.Println("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.")
183		fmt.Println("-----------------------------------------------------------------------------")
184		fmt.Println("The Data Extraction suite is an optional add-on, available for download")
185		fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
186		fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
187		fmt.Println("using the PDFNetAddResourceSearchPath() function.")
188		fmt.Println("")
189		return nil
190	}
191
192	// Extract form fields as a JSON file
193	fmt.Println("Extract form fields as a JSON file")
194
195	inputFile := inputPath + "formfields-scanned.pdf"
196	outputFile := outputPath + "formfields-scanned.json"
197	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_Form)
198
199	fmt.Println("Result saved in " + outputFile)
200
201	// Extract form fields as a JSON string
202	fmt.Println("Extract form fields as a JSON string")
203
204	inputFile = inputPath + "formfields.pdf"
205	outputFile = outputPath + "formfields.json"
206
207	json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_Form).(string)
208	WriteTextToFile(outputFile, json)
209
210	fmt.Println("Result saved in " + outputFile)
211
212	//////////////////////////////////////////////////////////////////////////
213	// Detect and add form fields to a PDF document.
214	// PDF document already has form fields, and this sample will update to new found fields.
215	doc := NewPDFDoc(inputPath + "formfields-scanned-withfields.pdf")
216
217	fmt.Println("Extract form fields as a PDF file, keep new fields")
218	DataExtractionModuleDetectAndAddFormFieldsToPDF(doc)
219
220	outputFile = outputPath + "formfields-scanned-fields-new.pdf"
221	doc.Save(outputFile, uint(SDFDocE_linearized))
222	doc.Close()
223
224	fmt.Println("Result saved in " + outputFile)
225
226	//////////////////////////////////////////////////////////////////////////
227	// Detect and add form fields to a PDF document.
228	// PDF document already has form fields, and this sample will keep the original fields.
229	doc = NewPDFDoc(inputPath + "formfields-scanned-withfields.pdf")
230
231	// Setup DataExtractionOptions to keep old fields
232	options := NewDataExtractionOptions()
233	options.SetOverlappingFormFieldBehavior("KeepOld")
234
235	fmt.Println("Extract form fields as a PDF file, keep old fields")
236	DataExtractionModuleDetectAndAddFormFieldsToPDF(doc, options)
237
238	outputFile = outputPath + "formfields-scanned-fields-old.pdf"
239	doc.Save(outputFile, uint(SDFDocE_linearized))
240	doc.Close()
241
242	fmt.Println("Result saved in " + outputFile)
243
244	return nil
245}
246
247//---------------------------------------------------------------------------------------
248// The following sample illustrates how to extract key-value pairs from PDF documents.
249//---------------------------------------------------------------------------------------
250
251func GenericKeyValueTest() (err error) {
252	defer catch(&err)
253
254	// Test if the add-on is installed
255	if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_GenericKeyValue) {
256		fmt.Println("")
257		fmt.Println("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.")
258		fmt.Println("-----------------------------------------------------------------------------")
259		fmt.Println("The Data Extraction suite is an optional add-on, available for download")
260		fmt.Println("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
261		fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
262		fmt.Println("using the PDFNetAddResourceSearchPath() function.")
263		fmt.Println("")
264		return nil
265	}
266
267	fmt.Println("Extract key-value pairs from a PDF")
268
269	inputFile := inputPath + "newsletter.pdf"
270	outputFile := outputPath + "newsletter_key_val.json"
271	// Simple example: Extract Keys & Values as a JSON file
272	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_GenericKeyValue)
273
274	fmt.Println("Result saved in " + outputFile)
275
276	// Example with customized options:
277	// Extract Keys & Values from pages 2-4, excluding ads
278	options := NewDataExtractionOptions()
279	options.SetPages("2-4")
280	
281	p2ExclusionZones := NewRectCollection()
282	// Exclude the ad on page 2
283	// These coordinates are in PDF user space, with the origin at the bottom left corner of the page
284	// Coordinates rotate with the page, if it has rotation applied.
285	p2ExclusionZones.AddRect(NewRect(166, 47, 562, 222))
286	options.AddExclusionZonesForPage(p2ExclusionZones, 2)
287
288	p4InclusionZones := NewRectCollection()
289	p4ExclusionZones := NewRectCollection()
290	// Only include the article text for page 4, exclude ads and headings
291	p4InclusionZones.AddRect(NewRect(30, 432, 562, 684))
292	p4ExclusionZones.AddRect(NewRect(30, 657, 295, 684))
293	options.AddInclusionZonesForPage(p4InclusionZones, 4)
294	options.AddExclusionZonesForPage(p4ExclusionZones, 4)
295	
296	fmt.Println("Extract Key-Value pairs from specific pages and zones as a JSON file")
297	outputFile = outputPath + "newsletter_key_val_with_zones.json"
298	DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_GenericKeyValue, options)
299
300	fmt.Println("Result saved in " + outputFile)
301
302	return nil
303}
304
305//---------------------------------------------------------------------------------------
306
307func TestDataExtraction(t *testing.T) {
308	// The first step in every application using PDFNet is to initialize the 
309	// library. The library is usually initialized only once, but calling 
310	// Initialize() multiple times is also fine.
311	PDFNetInitialize(licenseKey)
312
313	//-----------------------------------------------------------------------------------
314
315	PDFNetAddResourceSearchPath("../../../PDFNetC/Lib/")
316
317	//-----------------------------------------------------------------------------------
318
319	err := TabularDataTest()
320	if err != nil {
321		fmt.Println(fmt.Errorf("Unable to extract tabular data, error: %s", err))
322	}
323
324	//-----------------------------------------------------------------------------------
325
326	err = DocumentStructureTest()
327	if err != nil {
328		fmt.Println(fmt.Errorf("Unable to extract document structure data, error: %s", err))
329	}
330
331	//-----------------------------------------------------------------------------------
332
333	err = FormFieldsTest()
334	if err != nil {
335		fmt.Println(fmt.Errorf("Unable to extract form fields data, error: %s", err))
336	}
337
338	err = GenericKeyValueTest()
339	if err != nil {
340		fmt.Println(fmt.Errorf("Unable to extract key-value pairs, error: %s", err))
341	}
342
343	//-----------------------------------------------------------------------------------
344
345	PDFNetTerminate()
346	fmt.Println("Done.")
347}
348

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import java.io.FileWriter;
7import java.io.BufferedWriter;
8import java.io.FileNotFoundException;
9import java.io.IOException;
10
11import com.pdftron.common.PDFNetException;
12import com.pdftron.pdf.*;
13import com.pdftron.filters.*;
14import com.pdftron.sdf.SDFDoc;
15
16//---------------------------------------------------------------------------------------
17// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
18// extract various types of data from PDF documents.
19//
20// The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
21//---------------------------------------------------------------------------------------
22
23public class DataExtractionTest {
24
25	static void writeTextToFile(String filename, String text) throws IOException
26	{
27		BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
28		writer.write(text);
29		writer.close();
30	}
31
32	//---------------------------------------------------------------------------------------
33	// The following sample illustrates how to extract tables from PDF documents.
34	//---------------------------------------------------------------------------------------
35	static void testTabularData()
36	{
37		try {
38			// Test if the add-on is installed
39			if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular))
40			{
41				System.out.println();
42				System.out.println("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.");
43				System.out.println("---------------------------------------------------------------");
44				System.out.println("The Data Extraction suite is an optional add-on, available for download");
45				System.out.println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
46				System.out.println("module, ensure that the SDK is able to find the required files");
47				System.out.println("using the PDFNet.addResourceSearchPath() function." );
48				System.out.println();
49				return;
50			}
51		} catch (PDFNetException e) {
52			System.out.println("Data Extraction module not available, error:");
53			e.printStackTrace();
54			System.out.println(e);
55		}
56
57		// Relative path to the folder containing test files.
58		String input_path = "../../TestFiles/";
59		String output_path = "../../TestFiles/Output/";
60
61		try {
62			// Extract tabular data as a JSON file
63			DataExtractionModule.extractData(input_path + "table.pdf", output_path + "table.json", DataExtractionModule.DataExtractionEngine.e_tabular);
64
65			// Extract tabular data as a JSON string
66			String json = DataExtractionModule.extractData(input_path + "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular);
67			writeTextToFile(output_path + "financial.json", json);
68
69			// Extract tabular data as an XLSX file
70			DataExtractionModule.extractToXLSX(input_path + "table.pdf", output_path + "table.xlsx");
71
72			// Extract tabular data as an XLSX stream (also known as filter)
73			DataExtractionOptions options = new DataExtractionOptions();
74			options.setPages("1");
75			MemoryFilter output_xlsx_stream = new MemoryFilter(0, false);
76			DataExtractionModule.extractToXLSX(input_path + "financial.pdf", output_xlsx_stream, options);
77			output_xlsx_stream.setAsInputFilter();
78			output_xlsx_stream.writeToFile(output_path + "financial.xlsx", false);
79
80		} catch (PDFNetException e) {
81			System.out.println(e);
82		}
83		catch (IOException e) {
84			System.out.println(e);
85		}
86	}
87
88	//---------------------------------------------------------------------------------------
89	// The following sample illustrates how to extract document structure from PDF documents.
90	//---------------------------------------------------------------------------------------
91	static void testDocumentStructure()
92	{
93		// Test if the add-on is installed
94		try {
95			if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure))
96			{
97				System.out.println();
98				System.out.println("Unable to run Data Extraction: Apryse SDK Structured Output module not available.");
99				System.out.println("---------------------------------------------------------------");
100				System.out.println("The Data Extraction suite is an optional add-on, available for download");
101				System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
102				System.out.println("module, ensure that the SDK is able to find the required files");
103				System.out.println("using the PDFNet.addResourceSearchPath() function." );
104				System.out.println();
105				return;
106			}
107		} catch (PDFNetException e) {
108			System.out.println("Data Extraction module not available, error:");
109			e.printStackTrace();
110			System.out.println(e);
111		}
112
113		// Relative path to the folder containing test files.
114		String input_path = "../../TestFiles/";
115		String output_path = "../../TestFiles/Output/";
116
117		try {
118			// Extract document structure as a JSON file
119			DataExtractionModule.extractData(input_path + "paragraphs_and_tables.pdf", output_path + "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure);
120
121			// Extract document structure as a JSON string
122			String json = DataExtractionModule.extractData(input_path + "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure);
123			writeTextToFile(output_path + "tagged.json", json);
124
125		} catch (PDFNetException e) {
126			System.out.println(e);
127		}
128		catch (IOException e) {
129			System.out.println(e);
130		}
131	}
132
133	//---------------------------------------------------------------------------------------
134	// The following sample illustrates how to extract form fields from PDF documents.
135	//---------------------------------------------------------------------------------------
136	static void testFormFields()
137	{
138		try {
139			// Test if the add-on is installed
140			if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
141			{
142				System.out.println();
143				System.out.println("Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.");
144				System.out.println("---------------------------------------------------------------");
145				System.out.println("The Data Extraction suite is an optional add-on, available for download");
146				System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
147				System.out.println("module, ensure that the SDK is able to find the required files");
148				System.out.println("using the PDFNet.addResourceSearchPath() function." );
149				System.out.println();
150				return;
151			}
152		} catch (PDFNetException e) {
153			System.out.println("Data Extraction module not available, error:");
154			e.printStackTrace();
155			System.out.println(e);
156		}
157
158		// Relative path to the folder containing test files.
159		String input_path = "../../TestFiles/";
160		String output_path = "../../TestFiles/Output/";
161
162		try {
163			// Extract form fields as a JSON file
164			DataExtractionModule.extractData(input_path + "formfields-scanned.pdf", output_path + "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form);
165
166			// Extract form fields as a JSON string
167			String json = DataExtractionModule.extractData(input_path + "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form);
168			writeTextToFile(output_path + "formfields.json", json);
169
170			//---------------------------------------------------------------------------------------
171			// Detect and add form fields to a PDF document.
172			// PDF document already has form fields, and this sample will update to new found fields.
173			//---------------------------------------------------------------------------------------
174			try (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
175			{
176				DataExtractionModule.detectAndAddFormFieldsToPDF(doc);
177
178				// Save the modfied pdf document
179				doc.save(output_path + "formfields-scanned-fields-new.pdf", SDFDoc.SaveMode.LINEARIZED, null);
180			} catch (Exception e) {
181				e.printStackTrace();
182			}
183
184			//---------------------------------------------------------------------------------------
185			// Detect and add form fields to a PDF document.
186			// PDF document already has form fields, and this sample will keep the original fields.
187			//---------------------------------------------------------------------------------------
188			try (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
189			{
190				// Setup DataExtractionOptions to keep old fields
191				DataExtractionOptions options = new DataExtractionOptions();
192				options.setOverlappingFormFieldBehavior("KeepOld");
193
194				DataExtractionModule.detectAndAddFormFieldsToPDF(doc, options);
195
196				// Save the modfied pdf document
197				doc.save(output_path + "formfields-scanned-fields-old.pdf", SDFDoc.SaveMode.LINEARIZED, null);
198			} catch (Exception e) {
199				e.printStackTrace();
200			}
201
202		} catch (PDFNetException e) {
203			System.out.println(e);
204		}
205		catch (IOException e) {
206			System.out.println(e);
207		}
208	}
209
210	//---------------------------------------------------------------------------------------
211	// The following sample illustrates how to extract key-value pairs from PDF documents.
212	//---------------------------------------------------------------------------------------
213	public static void testGenericKeyValue() {
214		try {
215			// Test if the add-on is installed
216			if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
217			{
218				System.out.println();
219				System.out.println("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.");
220				System.out.println("---------------------------------------------------------------");
221				System.out.println("The Data Extraction suite is an optional add-on, available for download");
222				System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
223				System.out.println("module, ensure that the SDK is able to find the required files");
224				System.out.println("using the PDFNet.addResourceSearchPath() function." );
225				System.out.println();
226				return;
227			}
228		} catch (PDFNetException e) {
229			System.out.println("Data Extraction module not available, error:");
230			e.printStackTrace();
231			System.out.println(e);
232		}
233
234		// Relative path to the folder containing test files.
235		String input_path = "../../TestFiles/";
236		String output_path = "../../TestFiles/Output/";
237
238		try {
239
240			// Simple example: Extract Keys & Values as a JSON file
241			DataExtractionModule.extractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value);
242
243			// Example with customized options:
244			// Extract Keys & Values from pages 2-4, excluding ads
245			DataExtractionOptions options = new DataExtractionOptions();
246			options.setPages("2-4");
247
248			RectCollection p2ExclusionZones = new RectCollection();
249			// Exclude the ad on page 2
250			// These coordinates are in PDF user space, with the origin at the bottom left corner of the page
251			// Coordinates rotate with the page, if it has rotation applied.
252			p2ExclusionZones.addRect(166, 47, 562, 222);
253			options.addExclusionZonesForPage(p2ExclusionZones, 2);
254
255			RectCollection p4InclusionZones = new RectCollection();
256			RectCollection p4ExclusionZones = new RectCollection();
257			// Only include the article text for page 4, exclude ads and headings
258			p4InclusionZones.addRect(30, 432, 562, 684);
259			p4ExclusionZones.addRect(30, 657, 295, 684);
260			options.addInclusionZonesForPage(p4InclusionZones, 4);
261			options.addExclusionZonesForPage(p4ExclusionZones, 4);
262
263			DataExtractionModule.extractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options);
264
265		} catch (Exception e) {
266			System.out.println(e);
267		}        
268    }
269
270	public static void main(String[] args)
271	{
272		// The first step in every application using PDFNet is to initialize the 
273		// library and set the path to common PDF resources. The library is usually 
274		// initialized only once, but calling initialize() multiple times is also fine.
275		PDFNet.initialize(PDFTronLicense.Key());
276		PDFNet.addResourceSearchPath("../../../Lib/");
277
278		testTabularData();
279		testDocumentStructure();
280		testFormFields();
281		testGenericKeyValue();
282
283		PDFNet.terminate();
284	}
285}
286

1 <?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
10//---------------------------------------------------------------------------------------
11// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
12// extract various types of data from PDF documents.
13//
14// The Apryse SDK Data Extraction suite can be downloaded from
15// https://docs.apryse.com/core/guides/info/modules
16//
17// Please contact us if you have any questions.
18//---------------------------------------------------------------------------------------
19
/**
 * Writes $text to $outputFile, creating the file or truncating an existing one.
 *
 * @param string $outputFile Path of the file to write.
 * @param string $text       Content to write.
 * @throws RuntimeException  If the file cannot be opened or written, so that
 *                           callers catching Exception can report the failure.
 */
function WriteTextToFile($outputFile, $text)
{
	$outfile = fopen($outputFile, "w");
	// fopen() returns false on failure; handing that to fwrite() would raise an
	// Error that a catch(Exception) block does not intercept.
	if ($outfile === false) {
		throw new RuntimeException("Unable to open file for writing: " . $outputFile);
	}

	try {
		if (fwrite($outfile, $text) === false) {
			throw new RuntimeException("Unable to write to file: " . $outputFile);
		}
	}
	finally {
		// Release the handle whether or not the write succeeded.
		fclose($outfile);
	}
}
26
/**
 * Runs the Data Extraction samples in order: tabular data, document
 * structure, form fields and generic key-value pairs.
 *
 * For each engine the function first checks that the add-on is available and
 * prints a notice when it is not; otherwise it runs the extraction calls and
 * writes the results to the Output folder below the test-files folder. An
 * exception raised inside one section is reported and does not stop the
 * sections that follow.
 */
function main()
{
	// Relative path to the folder containing the test files.
	$inputPath = getcwd()."/../../TestFiles/";
	$outputPath = $inputPath."Output/";

	// The first step in every application using PDFNet is to initialize the 
	// library. The library is usually initialized only once, but calling 
	// Initialize() multiple times is also fine.
	global $LicenseKey;
	PDFNet::Initialize($LicenseKey);
	PDFNet::GetSystemFontList();    // Wait for fonts to be loaded if they haven't already. This is done because PHP can run into errors when shutting down if font loading is still in progress.
	
	//-----------------------------------------------------------------------------------

	PDFNet::AddResourceSearchPath("../../../PDFNetC/Lib/");

	//////////////////////////////////////////////////////////////////////////
	// The following sample illustrates how to extract tables from PDF documents.
	//////////////////////////////////////////////////////////////////////////

	// Test if the add-on is installed
	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular)) {
		echo(nl2br("\n"));
		echo(nl2br("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.\n"));
		echo(nl2br("-----------------------------------------------------------------------------\n"));
		echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
		echo(nl2br("at https://docs.apryse.com/core/guides/info/modules. If you have already\n"));
		echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
		echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
		echo(nl2br("\n"));
	}
	else {
		try {
			// Extract tabular data as a JSON file
			echo(nl2br("Extract tabular data as a JSON file\n"));

			$outputFile = $outputPath."table.json";
			DataExtractionModule::ExtractData($inputPath."table.pdf", $outputFile, DataExtractionModule::e_Tabular);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			///////////////////////////////////////////////////////
			// Extract tabular data as a JSON string
			echo(nl2br("Extract tabular data as a JSON string\n"));

			$outputFile = $outputPath."financial.json";
			$json = DataExtractionModule::ExtractData($inputPath."financial.pdf", DataExtractionModule::e_Tabular);
			WriteTextToFile($outputFile, $json);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			///////////////////////////////////////////////////////
			// Extract tabular data as an XLSX file
			echo(nl2br("Extract tabular data as an XLSX file\n"));

			$outputFile = $outputPath."table.xlsx";
			DataExtractionModule::ExtractToXLSX($inputPath."table.pdf", $outputFile);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			///////////////////////////////////////////////////////
			// Extract tabular data as an XLSX stream (also known as filter)
			echo(nl2br("Extract tabular data as an XLSX stream\n"));

			$outputFile = $outputPath."financial.xlsx";
			$outputXlsxStream = new MemoryFilter(0, false);
			$options = new DataExtractionOptions();
			$options->SetPages("1"); // page 1
			DataExtractionModule::ExtractToXLSX($inputPath."financial.pdf", $outputXlsxStream, $options);
			// Switch the filter to input mode before its content is written to disk.
			$outputXlsxStream->SetAsInputFilter();
			$outputXlsxStream->WriteToFile($outputFile, false);

			echo(nl2br("Result saved in " . $outputFile . "\n"));
		}
		catch(Exception $e) {
			echo(nl2br("Unable to extract tabular data, error: " . $e->getMessage() . "\n"));
		}
	}

	//////////////////////////////////////////////////////////////////////////
	// The following sample illustrates how to extract document structure from PDF documents.
	//////////////////////////////////////////////////////////////////////////

	// Test if the add-on is installed
	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure)) {
		echo(nl2br("\n"));
		echo(nl2br("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.\n"));
		echo(nl2br("-----------------------------------------------------------------------------\n"));
		echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
		echo(nl2br("at https://docs.apryse.com/documentation/core/info/modules/. If you have already\n"));
		echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
		echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
		echo(nl2br("\n"));
	}
	else {
		try {
			// Extract document structure as a JSON file
			echo(nl2br("Extract document structure as a JSON file\n"));

			$outputFile = $outputPath."paragraphs_and_tables.json";
			DataExtractionModule::ExtractData($inputPath."paragraphs_and_tables.pdf", $outputFile, DataExtractionModule::e_DocStructure);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			///////////////////////////////////////////////////////
			// Extract document structure as a JSON string
			echo(nl2br("Extract document structure as a JSON string\n"));

			$outputFile = $outputPath."tagged.json";
			$json = DataExtractionModule::ExtractData($inputPath."tagged.pdf", DataExtractionModule::e_DocStructure);
			WriteTextToFile($outputFile, $json);

			echo(nl2br("Result saved in " . $outputFile . "\n"));
		}
		catch(Exception $e) {
			echo(nl2br("Unable to extract document structure data, error: " . $e->getMessage() . "\n"));
		}
	}

	//////////////////////////////////////////////////////////////////////////
	// The following sample illustrates how to extract form fields from PDF documents.
	//////////////////////////////////////////////////////////////////////////

	// Test if the add-on is installed
	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form)) {
		echo(nl2br("\n"));
		echo(nl2br("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.\n"));
		echo(nl2br("-----------------------------------------------------------------------------\n"));
		echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
		echo(nl2br("at https://docs.apryse.com/documentation/core/info/modules/. If you have already\n"));
		echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
		echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
		echo(nl2br("\n"));
	}
	else {
		try {
			// Extract form fields as a JSON file
			echo(nl2br("Extract form fields as a JSON file\n"));

			$outputFile = $outputPath."formfields-scanned.json";
			DataExtractionModule::ExtractData($inputPath."formfields-scanned.pdf", $outputFile, DataExtractionModule::e_Form);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			///////////////////////////////////////////////////////
			// Extract form fields as a JSON string
			echo(nl2br("Extract form fields as a JSON string\n"));

			$outputFile = $outputPath."formfields.json";
			$json = DataExtractionModule::ExtractData($inputPath."formfields.pdf", DataExtractionModule::e_Form);
			WriteTextToFile($outputFile, $json);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			///////////////////////////////////////////////////////
			// Detect and add form fields to a PDF document.
			// PDF document already has form fields, and this sample will update to new found fields.
			echo(nl2br("Extract form fields as a PDF file\n"));

			$doc = new PDFDoc($inputPath."formfields-scanned-withfields.pdf");
			DataExtractionModule::DetectAndAddFormFieldsToPDF($doc);
			$doc->Save($outputPath."formfields-scanned-fields-new.pdf", SDFDoc::e_linearized);
			$doc->Close();

			echo(nl2br("Result saved in " . $outputPath ."formfields-scanned-fields-new.pdf" . "\n"));

			///////////////////////////////////////////////////////
			// Detect and add form fields to a PDF document.
			// PDF document already has form fields, and this sample will keep the original fields.
			echo(nl2br("Extract form fields as a PDF file\n"));
			
			$doc = new PDFDoc($inputPath."formfields-scanned-withfields.pdf");
			$options = new DataExtractionOptions();
			$options->SetOverlappingFormFieldBehavior("KeepOld");
			DataExtractionModule::DetectAndAddFormFieldsToPDF($doc, $options);
			$doc->Save($outputPath."formfields-scanned-fields-old.pdf", SDFDoc::e_linearized);
			$doc->Close();

			echo(nl2br("Result saved in " . $outputPath ."formfields-scanned-fields-old.pdf" . "\n"));

		}
		catch(Exception $e) {
			echo(nl2br("Unable to extract form fields data, error: " . $e->getMessage() . "\n"));
		}
	}

	//////////////////////////////////////////////////////////////////////////
	// The following sample illustrates how to extract key-value pairs from PDF documents.
	//////////////////////////////////////////////////////////////////////////

	// Test if the add-on is installed
	if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_GenericKeyValue)) {
		echo(nl2br("\n"));
		echo(nl2br("Unable to run Data Extraction: PDFTron SDK AIPageObjectExtractor module not available.\n"));
		echo(nl2br("-----------------------------------------------------------------------------\n"));
		echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
		echo(nl2br("at https://docs.apryse.com/documentation/core/info/modules/. If you have already\n"));
		echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
		echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
		echo(nl2br("\n"));
	}
	else {
		try {
			
			echo(nl2br("Extract key-value pairs from a PDF\n"));
			// Simple example: Extract Keys & Values as a JSON file
			$outputFile = $outputPath."newsletter_key_val.json";
			DataExtractionModule::ExtractData($inputPath."newsletter.pdf", $outputFile, DataExtractionModule::e_GenericKeyValue);

			echo(nl2br("Result saved in " . $outputFile . "\n"));

			// Example with customized options:
			// Extract Keys & Values from pages 2-4, excluding ads
			$options = new DataExtractionOptions();
			$options->SetPages("2-4");

			$p2ExclusionZones = new RectCollection();
			// Exclude the ad on page 2
			// These coordinates are in PDF user space, with the origin at the bottom left corner of the page
			// Coordinates rotate with the page, if it has rotation applied.
			$p2ExclusionZones->AddRect(new Rect(166.0, 47.0, 562.0, 222.0));
			$options->AddExclusionZonesForPage($p2ExclusionZones, 2);

			$p4InclusionZones = new RectCollection();
			$p4ExclusionZones = new RectCollection();
			// Only include the article text for page 4, exclude ads and headings
			$p4InclusionZones->AddRect(new Rect(30.0, 432.0, 562.0, 684.0));
			$p4ExclusionZones->AddRect(new Rect(30.0, 657.0, 295.0, 684.0));
			$options->AddInclusionZonesForPage($p4InclusionZones, 4);
			$options->AddExclusionZonesForPage($p4ExclusionZones, 4);

			echo(nl2br("Extract Key-Value pairs from specific pages and zones as a JSON file\n"));
			$outputFile = $outputPath."newsletter_key_val_with_zones.json";
			DataExtractionModule::ExtractData($inputPath."newsletter.pdf", $outputFile, DataExtractionModule::e_GenericKeyValue, $options);

			echo(nl2br("Result saved in " . $outputFile . "\n"));
		}
		catch(Exception $e) {
			// Name the key-value step so the failure is not mistaken for the
			// document-structure section above.
			echo(nl2br("Unable to extract key-value pairs, error: " . $e->getMessage() . "\n"));
		}
	}

	//-----------------------------------------------------------------------------------

	PDFNet::Terminate();
	echo(nl2br("Done.\n"));
}

main();
277?>
278

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6//---------------------------------------------------------------------------------------
7// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
8// extract various types of data from PDF documents.
9//
10// The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
11//---------------------------------------------------------------------------------------
12
13const fs = require('fs');
14const { PDFNet } = require('../../lib/pdfnet.js');
15const PDFTronLicense = require('../../LicenseKey/NODEJS/LicenseKey');
16
17((exports) => {
18	'use strict';
19
20	exports.runDataExtractionTest = () => {
21
22		const main = async () => {
23
24			const inputPath = '../TestFiles/';
25			const outputPath = '../TestFiles/Output/';
26
27			//////////////////////////////////////////////////////////////////////////
28
29			await PDFNet.addResourceSearchPath('../../lib/');
30
31			//////////////////////////////////////////////////////////////////////////
32			// The following sample illustrates how to extract tables from PDF documents.
33			//////////////////////////////////////////////////////////////////////////
34
35			// Test if the add-on is installed
36			if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular)) {
37				console.log('\nUnable to run Data Extraction: Apryse SDK Tabular Data module not available.');
38				console.log('---------------------------------------------------------------');
39				console.log('The Data Extraction suite is an optional add-on, available for download');
40				console.log('at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already');
41				console.log('downloaded this module, ensure that the SDK is able to find the required files');
42				console.log('using the PDFNet.addResourceSearchPath() function.\n');
43			}
44			else
45			{
46				try {
47					// Extract tabular data as a JSON file
48					console.log('Extract tabular data as a JSON file');
49
50					let outputFile = outputPath + 'table.json';
51					await PDFNet.DataExtractionModule.extractData(inputPath + 'table.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular);
52
53					console.log('Result saved in ' + outputFile);
54
55					///////////////////////////////////////////////////////
56					// Extract tabular data as a JSON string
57					console.log('Extract tabular data as a JSON string');
58
59					outputFile = outputPath + 'financial.json';
60					const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'financial.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular);
61					fs.writeFileSync(outputFile, json);
62
63					console.log('Result saved in ' + outputFile);
64
65					///////////////////////////////////////////////////////
66					// Extract tabular data as an XLSX file
67					console.log('Extract tabular data as an XLSX file');
68
69					outputFile = outputPath + 'table.xlsx';
70					await PDFNet.DataExtractionModule.extractToXLSX(inputPath + 'table.pdf', outputFile);
71
72					console.log('Result saved in ' + outputFile);
73
74					///////////////////////////////////////////////////////
75					// Extract tabular data as an XLSX stream (also known as filter)
76					console.log('Extract tabular data as an XLSX stream');
77
78					outputFile = outputPath + 'financial.xlsx';
79					const outputXlsxStream = await PDFNet.Filter.createMemoryFilter(0, false);
80					const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
81					options.setPages('1'); // page 1
82					await PDFNet.DataExtractionModule.extractToXLSXWithFilter(inputPath + 'financial.pdf', outputXlsxStream, options);
83					outputXlsxStream.memoryFilterSetAsInputFilter();
84					outputXlsxStream.writeToFile(outputFile, false);
85
86					console.log('Result saved in ' + outputFile);
87				} catch (err) {
88					console.log(err);
89				}
90			}
91
92			//////////////////////////////////////////////////////////////////////////
93			// The following sample illustrates how to extract document structure from PDF documents.
94			//////////////////////////////////////////////////////////////////////////
95
96			// Test if the add-on is installed
97			if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure)) {
98				console.log('\nUnable to run Data Extraction: Apryse SDK Structured Output module not available.');
99				console.log('---------------------------------------------------------------');
100				console.log('The Data Extraction suite is an optional add-on, available for download');
101				console.log('at https://docs.apryse.com/documentation/core/info/modules/. If you have already');
102				console.log('downloaded this module, ensure that the SDK is able to find the required files');
103				console.log('using the PDFNet.addResourceSearchPath() function.\n');
104			}
105			else
106			{
107				try {
108					// Extract document structure as a JSON file
109					console.log('Extract document structure as a JSON file');
110
111					let outputFile = outputPath + 'paragraphs_and_tables.json';
112					await PDFNet.DataExtractionModule.extractData(inputPath + 'paragraphs_and_tables.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
113
114					console.log('Result saved in ' + outputFile);
115
116					///////////////////////////////////////////////////////
117					// Extract document structure as a JSON string
118					console.log('Extract document structure as a JSON string');
119
120					outputFile = outputPath + 'tagged.json';
121					const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'tagged.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
122					fs.writeFileSync(outputFile, json);
123
124					console.log('Result saved in ' + outputFile);
125				} catch (err) {
126					console.log(err);
127				}
128			}
129
130			//////////////////////////////////////////////////////////////////////////
131			// The following sample illustrates how to extract form fields from PDF documents.
132			//////////////////////////////////////////////////////////////////////////
133
134			// Test if the add-on is installed
135			if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_Form)) {
136				console.log('\nUnable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.');
137				console.log('---------------------------------------------------------------');
138				console.log('The Data Extraction suite is an optional add-on, available for download');
139				console.log('at https://docs.apryse.com/documentation/core/info/modules/. If you have already');
140				console.log('downloaded this module, ensure that the SDK is able to find the required files');
141				console.log('using the PDFNet.addResourceSearchPath() function.\n');
142			}
143			else
144			{
145				try {
146					// Extract form fields as a JSON file
147					console.log('Extract form fields as a JSON file');
148
149					let outputFile = outputPath + 'formfields-scanned.json';
150					await PDFNet.DataExtractionModule.extractData(inputPath + 'formfields-scanned.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_Form);
151
152					console.log('Result saved in ' + outputFile);
153
154					///////////////////////////////////////////////////////
155					// Extract form fields as a JSON string
156					console.log('Extract form fields as a JSON string');
157
158					outputFile = outputPath + 'formfields.json';
159					const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'formfields.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_Form);
160					fs.writeFileSync(outputFile, json);
161
162					console.log('Result saved in ' + outputFile);
163
164					//////////////////////////////////////////////////////////////////////////
165					// Detect and add form fields to a PDF document.
166					// Document already has form fields, and this sample will update to new found fields.
167					{
168						console.log('Detect and add form fields in a PDF file, keep new fields');
169
170						const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'formfields-scanned-withfields.pdf');
171
172						await PDFNet.DataExtractionModule.detectAndAddFormFieldsToPDF(doc);
173						outputFile = outputPath + 'formfields-scanned-fields-new.pdf';
174						await doc.save(outputFile, PDFNet.SDFDoc.SaveOptions.e_linearized);
175
176						console.log('Result saved in ' + outputFile);
177					}
178
179					//////////////////////////////////////////////////////////////////////////
180					// Detect and add form fields to a PDF document.
181					// Document already has form fields, and this sample will keep the original fields.
182					{
183						console.log('Detect and add form fields in a PDF file, keep old fields');
184
185						const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'formfields-scanned-withfields.pdf');
186
187						const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
188						options.setOverlappingFormFieldBehavior('KeepOld');
189
190						await PDFNet.DataExtractionModule.detectAndAddFormFieldsToPDF(doc, options);
191						outputFile = outputPath + 'formfields-scanned-fields-old.pdf';
192						await doc.save(outputFile, PDFNet.SDFDoc.SaveOptions.e_linearized);
193					}
194
195					console.log('Result saved in ' + outputFile);
196
197				} catch (err) {
198					console.log(err);
199				}
200			}
201
202			//////////////////////////////////////////////////////////////////////////
203			// The following sample illustrates how to extract key-value pairs from PDF documents.
204			//////////////////////////////////////////////////////////////////////////
205			if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_GenericKeyValue)) {
206				console.log();
207				console.log('Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.');
208				console.log('---------------------------------------------------------------');
209				console.log('The Data Extraction suite is an optional add-on, available for download');
210				console.log('at http://www.pdftron.com/. If you have already downloaded this');
211				console.log('module, ensure that the SDK is able to find the required files');
212				console.log('using the PDFNet.addResourceSearchPath() function.');
213				console.log();
214			}
215			else
216			{
217				try {
218					// Simple example: Extract Keys & Values as a JSON file
219					console.log('Extract Key-Value pairs as a JSON file');
220					await PDFNet.DataExtractionModule.extractData(inputPath + 'newsletter.pdf', outputPath + 'newsletter_key_val.json', PDFNet.DataExtractionModule.DataExtractionEngine.e_GenericKeyValue);
221					console.log('Result saved in ' + outputPath + 'newsletter_key_val.json');
222					
223					const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
224					options.setPages('2-4');
225				
226					const p2ExclusionZones = [];
227					// Exclude the ad on page 2
228					// These coordinates are in PDF user space, with the origin at the bottom left corner of the page
229					// Coordinates rotate with the page, if it has rotation applied.
230					p2ExclusionZones.push(new PDFNet.Rect(166, 47, 562, 222));
231					options.addExclusionZonesForPage(p2ExclusionZones, 2);
232				
233					const p4InclusionZones = [];
234					const p4ExclusionZones = [];
235					// Only include the article text for page 4, exclude ads and headings
236					p4InclusionZones.push(new PDFNet.Rect(30, 432, 562, 684));
237					p4ExclusionZones.push(new PDFNet.Rect(30, 657, 295, 684));
238					options.addInclusionZonesForPage(p4InclusionZones, 4);
239					options.addExclusionZonesForPage(p4ExclusionZones, 4);
240					console.log('Extract Key-Value pairs from specific pages and zones as a JSON file');
241					await PDFNet.DataExtractionModule.extractData(inputPath + 'newsletter.pdf', outputPath + 'newsletter_key_val_with_zones.json', PDFNet.DataExtractionModule.DataExtractionEngine.e_GenericKeyValue, options);
242					console.log('Result saved in ' + outputPath + 'newsletter_key_val_with_zones.json');
243				} catch (err) {
244					console.log(err);
245				}
246			}
247			//////////////////////////////////////////////////////////////////////////
248
249			console.log('Done.');
250		};
251
252		PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function (error) {
253			console.log('Error: ' + JSON.stringify(error));
254		}).then(function () { return PDFNet.shutdown(); });
255	};
256	exports.runDataExtractionTest();
257})(exports);
258// eslint-disable-next-line spaced-comment
259//# sourceURL=DataExtractionTest.js
260

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11import platform
12
13sys.path.append("../../LicenseKey/PYTHON")
14from LicenseKey import *
15
16#---------------------------------------------------------------------------------------
17# The Data Extraction suite is an optional PDFNet add-on collection that can be used to
18# extract various types of data from PDF documents.
19#
20# The Apryse SDK Data Extraction suite can be downloaded from
21# https://docs.apryse.com/core/guides/info/modules#data-extraction-module
22#
23# Please contact us if you have any questions.
24#---------------------------------------------------------------------------------------
25
26# Relative path to the folder containing the test files.
27inputPath = "../../TestFiles/"
28outputPath = "../../TestFiles/Output/"
29
def WriteTextToFile(outputFile, text):
    """Write ``text`` to ``outputFile`` as UTF-8, replacing any existing content.

    Args:
        outputFile: Path of the file to create or overwrite.
        text: Text to write. A byte string is decoded as UTF-8 first, so the
            helper accepts both text and byte strings.
    """
    # Imported locally so the helper does not depend on the file's import block.
    import io

    if isinstance(text, bytes):
        text = text.decode("utf-8")
    # Use an explicit encoding: the platform default may be unable to encode
    # the non-ASCII characters that extracted JSON can contain.
    with io.open(outputFile, "w", encoding="utf-8") as f:
        f.write(text)
37
def _ExtractTabularData():
    # Extract tables from the sample PDFs as JSON (file and string) and XLSX (file and stream).

    # Test if the add-on is installed
    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_Tabular):
        print("")
        print("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.")
        print("-----------------------------------------------------------------------------")
        print("The Data Extraction suite is an optional add-on, available for download")
        print("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
        print("downloaded this module, ensure that the SDK is able to find the required files")
        print("using the PDFNet.AddResourceSearchPath() function.")
        print("")
        return

    try:
        # Extract tabular data as a JSON file
        print("Extract tabular data as a JSON file")

        outputFile = outputPath + "table.json"
        DataExtractionModule.ExtractData(inputPath + "table.pdf", outputFile, DataExtractionModule.e_Tabular)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract tabular data as a JSON string
        print("Extract tabular data as a JSON string")

        outputFile = outputPath + "financial.json"
        jsonText = DataExtractionModule.ExtractData(inputPath + "financial.pdf", DataExtractionModule.e_Tabular)
        WriteTextToFile(outputFile, jsonText)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract tabular data as an XLSX file
        print("Extract tabular data as an XLSX file")

        outputFile = outputPath + "table.xlsx"
        DataExtractionModule.ExtractToXLSX(inputPath + "table.pdf", outputFile)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract tabular data as an XLSX stream (also known as filter)
        print("Extract tabular data as an XLSX stream")

        outputFile = outputPath + "financial.xlsx"
        options = DataExtractionOptions()
        options.SetPages("1") # page 1
        outputXlsxStream = MemoryFilter(0, False)
        DataExtractionModule.ExtractToXLSX(inputPath + "financial.pdf", outputXlsxStream, options)
        outputXlsxStream.SetAsInputFilter()
        outputXlsxStream.WriteToFile(outputFile, False)

        print("Result saved in " + outputFile)
    except Exception as e:
        print("Unable to extract tabular data, error: " + str(e))


def _ExtractDocumentStructure():
    # Extract the document structure of the sample PDFs as a JSON file and as a JSON string.

    # Test if the add-on is installed
    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_DocStructure):
        print("")
        print("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
        print("-----------------------------------------------------------------------------")
        print("The Data Extraction suite is an optional add-on, available for download")
        print("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
        print("downloaded this module, ensure that the SDK is able to find the required files")
        print("using the PDFNet.AddResourceSearchPath() function.")
        print("")
        return

    try:
        # Extract document structure as a JSON file
        print("Extract document structure as a JSON file")

        outputFile = outputPath + "paragraphs_and_tables.json"
        DataExtractionModule.ExtractData(inputPath + "paragraphs_and_tables.pdf", outputFile, DataExtractionModule.e_DocStructure)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract document structure as a JSON string
        print("Extract document structure as a JSON string")

        outputFile = outputPath + "tagged.json"
        jsonText = DataExtractionModule.ExtractData(inputPath + "tagged.pdf", DataExtractionModule.e_DocStructure)
        WriteTextToFile(outputFile, jsonText)

        print("Result saved in " + outputFile)
    except Exception as e:
        print("Unable to extract document structure data, error: " + str(e))


def _DetectAndSaveFormFields(outputFile, options=None):
    # Detect form fields in the sample PDF, add them to the document and save it to outputFile.
    doc = PDFDoc(inputPath + "formfields-scanned-withfields.pdf")
    try:
        if options is None:
            DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)
        else:
            DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)
        doc.Save(outputFile, SDFDoc.e_linearized)
    finally:
        # Close the document even when detection or saving raises.
        doc.Close()


def _ExtractFormFields():
    # Extract form fields as JSON, then detect form fields and add them to an existing PDF.

    # Test if the add-on is installed
    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_Form):
        print("")
        print("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.")
        print("-----------------------------------------------------------------------------")
        print("The Data Extraction suite is an optional add-on, available for download")
        print("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
        print("downloaded this module, ensure that the SDK is able to find the required files")
        print("using the PDFNet.AddResourceSearchPath() function.")
        print("")
        return

    try:
        # Extract form fields as a JSON file
        print("Extract form fields as a JSON file")

        outputFile = outputPath + "formfields-scanned.json"
        DataExtractionModule.ExtractData(inputPath + "formfields-scanned.pdf", outputFile, DataExtractionModule.e_Form)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract form fields as a JSON string
        print("Extract form fields as a JSON string")

        outputFile = outputPath + "formfields.json"
        jsonText = DataExtractionModule.ExtractData(inputPath + "formfields.pdf", DataExtractionModule.e_Form)
        WriteTextToFile(outputFile, jsonText)

        print("Result saved in " + outputFile)

        #-----------------------------------------------------------------------------------
        # Detect and add form fields to a PDF document.
        # PDF document already has form fields, and this sample will update to new found fields.
        print("Extract form fields as a pdf file, update to new")

        outputFile = outputPath + "formfields-scanned-fields-new.pdf"
        _DetectAndSaveFormFields(outputFile)

        print("Result saved in " + outputFile)

        #-----------------------------------------------------------------------------------
        # Detect and add form fields to a PDF document.
        # PDF document already has form fields, and this sample will keep the original fields.
        print("Extract form fields as a pdf file, keep original")

        options = DataExtractionOptions()
        options.SetOverlappingFormFieldBehavior("KeepOld")

        outputFile = outputPath + "formfields-scanned-fields-old.pdf"
        _DetectAndSaveFormFields(outputFile, options)

        print("Result saved in " + outputFile)

    except Exception as e:
        print("Unable to extract form fields data, error: " + str(e))


def _ExtractKeyValuePairs():
    # Extract key-value pairs from the newsletter PDF, first whole-document, then limited to pages and zones.

    # Test if the add-on is installed
    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_GenericKeyValue):
        print("")
        print("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.")
        print("---------------------------------------------------------------")
        print("The Data Extraction suite is an optional add-on, available for download")
        print("at http://www.pdftron.com/. If you have already downloaded this")
        print("module, ensure that the SDK is able to find the required files")
        print("using the PDFNet.AddResourceSearchPath() function.")
        print("")
        return

    try:
        print("Extract key-value pairs from a PDF")
        # Simple example: Extract Keys & Values as a JSON file
        outputFile = outputPath + "newsletter_key_val.json"
        DataExtractionModule.ExtractData(inputPath + "newsletter.pdf", outputFile, DataExtractionModule.e_GenericKeyValue)
        print("Result saved in " + outputFile)

        # Example with customized options:
        # Extract Keys & Values from pages 2-4, excluding ads
        options = DataExtractionOptions()
        options.SetPages("2-4")

        p2_exclusion_zones = RectCollection()
        # Exclude the ad on page 2
        # These coordinates are in PDF user space, with the origin at the bottom left corner of the page
        # Coordinates rotate with the page, if it has rotation applied.
        p2_exclusion_zones.AddRect(Rect(166, 47, 562, 222))
        options.AddExclusionZonesForPage(p2_exclusion_zones, 2)

        p4_inclusion_zones = RectCollection()
        p4_exclusion_zones = RectCollection()
        # Only include the article text for page 4, exclude ads and headings
        p4_inclusion_zones.AddRect(Rect(30, 432, 562, 684))
        p4_exclusion_zones.AddRect(Rect(30, 657, 295, 684))
        options.AddInclusionZonesForPage(p4_inclusion_zones, 4)
        options.AddExclusionZonesForPage(p4_exclusion_zones, 4)

        print("Extract Key-Value pairs from specific pages and zones as a JSON file")
        outputFile = outputPath + "newsletter_key_val_with_zones.json"
        DataExtractionModule.ExtractData(inputPath + "newsletter.pdf", outputFile, DataExtractionModule.e_GenericKeyValue, options)
        print("Result saved in " + outputFile)
    except Exception as e:
        print("Unable to extract key-value data, error: " + str(e))


def main():
    """Run every Data Extraction sample in turn.

    Initializes PDFNet, registers the add-on search path, then runs the
    tabular, document-structure, form-field and key-value samples. Each
    sample reports its own extraction errors, so one failing sample does not
    stop the others.
    """
    # The first step in every application using PDFNet is to initialize the
    # library. The library is usually initialized only once, but calling
    # Initialize() multiple times is also fine.
    PDFNet.Initialize(LicenseKey)

    try:
        PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")

        _ExtractTabularData()
        _ExtractDocumentStructure()
        _ExtractFormFields()
        _ExtractKeyValuePairs()
    finally:
        # Terminate the library even if a call outside the samples' own
        # try/except blocks (e.g. a module availability check) raises.
        PDFNet.Terminate()

    print("Done.")
257    
258if __name__ == '__main__':
259    main()
260

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
12#---------------------------------------------------------------------------------------
13# The Data Extraction suite is an optional PDFNet add-on collection that can be used to
14# extract various types of data from PDF documents.
15#
16# The Apryse SDK Data Extraction suite can be downloaded from
17# https://docs.apryse.com/core/guides/info/modules#data-extraction-module
18#
19# Please contact us if you have any questions.
20#---------------------------------------------------------------------------------------
21
22# Relative path to the folder containing the test files.
23$inputPath = "../../TestFiles/"
24$outputPath = "../../TestFiles/Output/"
25	
26def main()
27	# The first step in every application using PDFNet is to initialize the 
28	# library. The library is usually initialized only once, but calling 
29	# Initialize() multiple times is also fine.
30	PDFNet.Initialize(PDFTronLicense.Key)
31
32	PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")
33
34	#-----------------------------------------------------------------------------------
35	# The following sample illustrates how to extract tables from PDF documents.
36	#-----------------------------------------------------------------------------------
37
38	# Test if the add-on is installed
39	if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_Tabular) then
40		puts ""
41		puts "Unable to run Data Extraction: Apryse SDK Tabular Data module not available."
42		puts "-----------------------------------------------------------------------------"
43		puts "The Data Extraction suite is an optional add-on, available for download"
44		puts "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already"
45		puts "downloaded this module, ensure that the SDK is able to find the required files"
46		puts "using the PDFNet.AddResourceSearchPath() function."
47		puts ""
48	else
49		begin
50			# Extract tabular data as a JSON file
51			puts "Extract tabular data as a JSON file"
52	
53			outputFile = $outputPath + "table.json"
54			DataExtractionModule.ExtractData($inputPath + "table.pdf", outputFile, DataExtractionModule::E_Tabular)
55
56			puts "Result saved in " + outputFile
57
58			#------------------------------------------------------
59			# Extract tabular data as a JSON string
60			puts "Extract tabular data as a JSON string"
61	
62			outputFile = $outputPath + "financial.json"
63			json = DataExtractionModule.ExtractData($inputPath + "financial.pdf", DataExtractionModule::E_Tabular)
64			File.open(outputFile, 'w') { |file| file.write(json) }
65	
66			puts "Result saved in " + outputFile
67
68			#------------------------------------------------------
69			# Extract tabular data as an XLSX file
70			puts "Extract tabular data as an XLSX file"
71	
72			outputFile = $outputPath + "table.xlsx"
73			DataExtractionModule.ExtractToXLSX($inputPath + "table.pdf", outputFile)
74	
75			puts "Result saved in " + outputFile
76
77			#------------------------------------------------------
78			# Extract tabular data as an XLSX stream (also known as filter)
79			puts "Extract tabular data as an XLSX stream"
80	
81			outputFile = $outputPath + "financial.xlsx"
82			outputXlsxStream = MemoryFilter.new(0, false)
83			options = DataExtractionOptions.new()
84			options.SetPages("1") # page 1
85			DataExtractionModule.ExtractToXLSX($inputPath + "financial.pdf", outputXlsxStream, options)
86			outputXlsxStream.SetAsInputFilter()
87			outputXlsxStream.WriteToFile(outputFile, false)
88	
89			puts "Result saved in " + outputFile
90		rescue => error
91			puts "Unable to extract tabular data, error: " + error.message
92		end
93	end
94
95	#-----------------------------------------------------------------------------------
96	# The following sample illustrates how to extract document structure from PDF documents.
97	#-----------------------------------------------------------------------------------
98
99	# Test if the add-on is installed
100	if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_DocStructure) then
101		puts ""
102		puts "Unable to run Data Extraction: PDFTron SDK Structured Output module not available."
103		puts "-----------------------------------------------------------------------------"
104		puts "The Data Extraction suite is an optional add-on, available for download"
105		puts "at https://docs.apryse.com/documentation/core/info/modules/. If you have already"
106		puts "downloaded this module, ensure that the SDK is able to find the required files"
107		puts "using the PDFNet.AddResourceSearchPath() function."
108		puts ""
109	else
110		begin
111			# Extract document structure as a JSON file
112			puts "Extract document structure as a JSON file"
113	
114			outputFile = $outputPath + "paragraphs_and_tables.json"
115			DataExtractionModule.ExtractData($inputPath + "paragraphs_and_tables.pdf", outputFile, DataExtractionModule::E_DocStructure)
116
117			puts "Result saved in " + outputFile
118
119			#------------------------------------------------------
120			# Extract document structure as a JSON string
121			puts "Extract document structure as a JSON string"
122	
123			outputFile = $outputPath + "tagged.json"
124			json = DataExtractionModule.ExtractData($inputPath + "tagged.pdf", DataExtractionModule::E_DocStructure)
125			File.open(outputFile, 'w') { |file| file.write(json) }
126	
127			puts "Result saved in " + outputFile
128		rescue => error
129			puts "Unable to extract document structure data, error: " + error.message
130		end
131	end
132
133	#-----------------------------------------------------------------------------------
134	# The following sample illustrates how to extract form fields from PDF documents.
135	#-----------------------------------------------------------------------------------
136
137	# Test if the add-on is installed
138	if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_Form) then
139		puts ""
140		puts "Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available."
141		puts "-----------------------------------------------------------------------------"
142		puts "The Data Extraction suite is an optional add-on, available for download"
143		puts "at https://docs.apryse.com/documentation/core/info/modules/. If you have already"
144		puts "downloaded this module, ensure that the SDK is able to find the required files"
145		puts "using the PDFNet.AddResourceSearchPath() function."
146		puts ""
147	else
148		begin
149			# Extract form fields as a JSON file
150			puts "Extract form fields as a JSON file"
151	
152			outputFile = $outputPath + "formfields-scanned.json"
153			DataExtractionModule.ExtractData($inputPath + "formfields-scanned.pdf", outputFile, DataExtractionModule::E_Form)
154
155			puts "Result saved in " + outputFile
156
157			#------------------------------------------------------
158			# Extract form fields as a JSON string
159			puts "Extract form fields as a JSON string"
160	
161			outputFile = $outputPath + "formfields.json"
162			json = DataExtractionModule.ExtractData($inputPath + "formfields.pdf", DataExtractionModule::E_Form)
163			File.open(outputFile, 'w') { |file| file.write(json) }
164	
165			puts "Result saved in " + outputFile
166			
167			#-----------------------------------------------------------------------------------
168			# Detect and add form fields to a PDF document.
169			# PDF document already has form fields, and this sample will update to the new fields.
170			puts "Extract document structure as a PDF file"
171			doc = PDFDoc.new($inputPath + "formfields-scanned-withfields.pdf")
172	
173			outputFile = $outputPath + "formfields-scanned-fields-new.pdf"
174			
175			DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)
176			doc.Save(outputFile, SDFDoc::E_linearized);
177			doc.Close
178
179			puts "Result saved in " + outputFile
180
181			#-----------------------------------------------------------------------------------
182			# Detect and add form fields to a PDF document.
183			# PDF document already has form fields, and this sample will keep the original fields.
184			puts "Extract document structure as a PDF file"
185			doc = PDFDoc.new($inputPath + "formfields-scanned-withfields.pdf")
186	
187			outputFile = $outputPath + "formfields-scanned-fields-old.pdf"
188			
189			options = DataExtractionOptions.new()
190			options.SetOverlappingFormFieldBehavior("KeepOld")
191			DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)
192			doc.Save(outputFile, SDFDoc::E_linearized);
193			doc.Close
194
195			puts "Result saved in " + outputFile
196
197
198		rescue => error
199			puts "Unable to extract form fields data, error: " + error.message
200		end
201	end
202
203	if !DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_GenericKeyValue) then
204		puts ""
205		puts "Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available."
206		puts "-----------------------------------------------------------------------------"
207		puts "The Data Extraction suite is an optional add-on, available for download"
208		puts "at https://docs.apryse.com/documentation/core/info/modules/. If you have already"
209		puts "downloaded this module, ensure that the SDK is able to find the required files"
210		puts "using the PDFNet.AddResourceSearchPath() function."
211		puts ""
212	else
213		begin
214        	puts "Extract key-value pairs from a PDF"
215			# Simple example: Extract Keys & Values as a JSON file
216			DataExtractionModule.ExtractData($inputPath + "newsletter.pdf", $outputPath + "newsletter_key_val.json", DataExtractionModule::E_GenericKeyValue)
217			puts "Result saved in " + $outputPath + "newsletter_key_val.json"
218
219			# Example with customized options:
220			# Extract Keys & Values from pages 2-4, excluding ads
221			options = DataExtractionOptions.new()
222			options.SetPages("2-4")
223
224			p2_exclusion_zones = RectCollection.new()
225			# Exclude the ad on page 2
226			# These coordinates are in PDF user space, with the origin at the bottom left corner of the page
227			# Coordinates rotate with the page, if it has rotation applied.
228			p2_exclusion_zones.AddRect(Rect.new(166, 47, 562, 222))
229			options.AddExclusionZonesForPage(p2_exclusion_zones, 2)
230
231			p4_inclusion_zones = RectCollection.new()
232			p4_exclusion_zones = RectCollection.new()
233			# Only include the article text for page 4, exclude ads and headings
234			p4_inclusion_zones.AddRect(Rect.new(30, 432, 562, 684))
235			p4_exclusion_zones.AddRect(Rect.new(30, 657, 295, 684))
236			options.AddInclusionZonesForPage(p4_inclusion_zones, 4)
237			options.AddExclusionZonesForPage(p4_exclusion_zones, 4)
238			puts "Extract Key-Value pairs from specific pages and zones as a JSON file"
239			DataExtractionModule.ExtractData($inputPath + "newsletter.pdf", $outputPath + "newsletter_key_val_with_zones.json", DataExtractionModule::E_GenericKeyValue, options)
240			puts "Result saved in " + $outputPath + "newsletter_key_val_with_zones.json"
241
242		rescue => error
243			puts "Unable to extract form fields data, error: " + error.message
244		end
245	end
246
247	#-----------------------------------------------------------------------------------
248
249	PDFNet.Terminate
250	puts "Done."
251end
252
253main()
254

1'
2' Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3'
4
5Imports pdftron
6Imports pdftron.Common
7Imports pdftron.PDF
8Imports pdftron.Filters
9
10' The Data Extraction suite is an optional PDFNet add-on collection that can be used to
11' extract various types of data from PDF documents.
12' The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
13
Module DataExtractionTestVB
	' Holds the PDFNetLoader singleton obtained in the module constructor below.
	Dim pdfNetLoader As PDFNetLoader
	Sub New()
		pdfNetLoader = pdftron.PDFNetLoader.Instance()
	End Sub

	' Relative path to the folder containing test files.
	Dim input_path As String = "../../../../TestFiles/"
	Dim output_path As String = "../../../../TestFiles/Output/"

	' Download location printed when an add-on module cannot be found.
	Private Const module_download_url As String = "https://docs.apryse.com/core/guides/info/modules#data-extraction-module"

	' Entry point: initializes PDFNet, runs every Data Extraction sample in turn,
	' and always terminates PDFNet afterwards.
	Sub Main()
		PDFNet.Initialize(PDFTronLicense.Key)
		Try
			PDFNet.AddResourceSearchPath("../../../../../Lib/")

			TestTabularData()
			TestDocumentStructure()
			TestFormFields()
			TestGenericKeyValue()
		Finally
			' Runs even when a sample lets a non-PDFNet exception (e.g. a file I/O error) escape.
			PDFNet.Terminate()
		End Try
	End Sub


	' Prints the shared "module not available" notice.
	' moduleName: display name of the missing add-on module, inserted into the first line.
	Private Sub PrintModuleUnavailable(moduleName As String)
		Console.WriteLine()
		Console.WriteLine("Unable to run Data Extraction: Apryse SDK " & moduleName & " module not available.")
		Console.WriteLine("---------------------------------------------------------------")
		Console.WriteLine("The Data Extraction suite is an optional add-on, available for download")
		Console.WriteLine("at " & module_download_url & " . If you have already downloaded this")
		Console.WriteLine("module, ensure that the SDK is able to find the required files")
		Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.")
		Console.WriteLine()
	End Sub


	' The following sample illustrates how to extract tables from PDF documents.
	' PDFNet errors are caught and their message is written to the console.
	Sub TestTabularData()
		' Test if the add-on is installed
		If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular) Then
			PrintModuleUnavailable("Tabular Data")
			Return
		End If

		Try
			' Extract tabular data as a JSON file
			DataExtractionModule.ExtractData(input_path & "table.pdf", output_path & "table.json", DataExtractionModule.DataExtractionEngine.e_tabular)

			' Extract tabular data as a JSON string
			Dim json As String = DataExtractionModule.ExtractData(input_path & "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular)
			System.IO.File.WriteAllText(output_path & "financial.json", json)

			' Extract tabular data as an XLSX file
			DataExtractionModule.ExtractToXLSX(input_path & "table.pdf", output_path & "table.xlsx")

			' Extract tabular data as an XLSX stream (also known as filter)
			Dim output_xlsx_stream As MemoryFilter = New MemoryFilter(0, False)
			DataExtractionModule.ExtractToXLSX(input_path & "financial.pdf", output_xlsx_stream)
			' The filter was filled as an output; switch it to input mode before writing it to disk.
			output_xlsx_stream.SetAsInputFilter()
			output_xlsx_stream.WriteToFile(output_path & "financial.xlsx", False)

		Catch e As PDFNetException
			Console.WriteLine(e.Message)
		End Try
	End Sub


	' The following sample illustrates how to extract document structure from PDF documents.
	' PDFNet errors are caught and their message is written to the console.
	Sub TestDocumentStructure()
		' Test if the add-on is installed
		If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure) Then
			PrintModuleUnavailable("Structured Output")
			Return
		End If

		Try
			' Extract document structure as a JSON file
			DataExtractionModule.ExtractData(input_path & "paragraphs_and_tables.pdf", output_path & "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure)

			' Extract document structure as a JSON string
			Dim json As String = DataExtractionModule.ExtractData(input_path & "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure)
			System.IO.File.WriteAllText(output_path & "tagged.json", json)

		Catch e As PDFNetException
			Console.WriteLine(e.Message)
		End Try
	End Sub


	' The following sample illustrates how to extract form fields from PDF documents.
	' PDFNet errors are caught and their message is written to the console.
	Sub TestFormFields()
		' Test if the add-on is installed
		If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form) Then
			PrintModuleUnavailable("AIFormFieldExtractor")
			Return
		End If

		Try
			' Extract form fields as a JSON file
			DataExtractionModule.ExtractData(input_path & "formfields-scanned.pdf", output_path & "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form)

			' Extract form fields as a JSON string
			Dim json As String = DataExtractionModule.ExtractData(input_path & "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form)
			System.IO.File.WriteAllText(output_path & "formfields.json", json)

			' Detect and add form fields to a PDF document.
			' PDF document already has form fields, and this sample will update to new found fields.
			Using doc = New PDFDoc(input_path & "formfields-scanned-withfields.pdf")
				DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)
				doc.Save(output_path & "formfields-scanned-fields-new.pdf", SDF.SDFDoc.SaveOptions.e_linearized)
			End Using

			' Detect and add form fields to a PDF document.
			' PDF document already has form fields, and this sample will keep the original fields.
			Using doc = New PDFDoc(input_path & "formfields-scanned-withfields.pdf")
				Dim options = New DataExtractionOptions()
				options.SetOverlappingFormFieldBehavior("KeepOld")
				DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)
				doc.Save(output_path & "formfields-scanned-fields-old.pdf", SDF.SDFDoc.SaveOptions.e_linearized)
			End Using

		Catch e As PDFNetException
			Console.WriteLine(e.Message)
		End Try

	End Sub

	' The following sample illustrates how to extract key-value pairs from PDF documents.
	' PDFNet errors are caught and their message is written to the console, matching the other samples.
	Sub TestGenericKeyValue()
		' Test if the add-on is installed
		If Not DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_generic_key_value) Then
			PrintModuleUnavailable("AIPageObjectExtractor")
			Return
		End If

		Try
			' Simple example: Extract Keys & Values as a JSON file
			DataExtractionModule.ExtractData(input_path & "newsletter.pdf", output_path & "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value)

			' Example with customized options:
			' Extract Keys & Values from pages 2-4, excluding ads
			Dim options As New DataExtractionOptions()
			options.SetPages("2-4")

			Dim p2ExclusionZones As New RectCollection()
			' Exclude the ad on page 2
			' These coordinates are in PDF user space, with the origin at the bottom left corner of the page
			' Coordinates rotate with the page, if it has rotation applied.
			p2ExclusionZones.AddRect(166, 47, 562, 222)
			options.AddExclusionZonesForPage(p2ExclusionZones, 2)

			Dim p4InclusionZones As New RectCollection()
			Dim p4ExclusionZones As New RectCollection()
			' Only include the article text for page 4, exclude ads and headings
			p4InclusionZones.AddRect(30, 432, 562, 684)
			p4ExclusionZones.AddRect(30, 657, 295, 684)
			options.AddInclusionZonesForPage(p4InclusionZones, 4)
			options.AddExclusionZonesForPage(p4ExclusionZones, 4)

			DataExtractionModule.ExtractData(input_path & "newsletter.pdf", output_path & "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options)

		Catch e As PDFNetException
			Console.WriteLine(e.Message)
		End Try
	End Sub

End Module
190

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales

Product:

Smart Data Extraction