Sample C# code showing how to use the Apryse Data Extraction module to extract tabular data, document structure, and form fields from PDF documents. Learn more about our Server SDK.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6using System;
7
8using pdftron;
9using pdftron.Common;
10using pdftron.PDF;
11using pdftron.SDF;
12using pdftron.Filters;
13
14namespace DataExtractionTestCS
15{
16 /// <summary>
17 ///---------------------------------------------------------------------------------------
18 /// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
19 /// extract various types of data from PDF documents.
20 ///
21 /// The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
22 //---------------------------------------------------------------------------------------
23 /// </summary>
24 class Class1
25 {
26 private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
27 static Class1() { }
28
29 // Relative path to the folder containing test files.
30 static string input_path = "../../../../TestFiles/";
31 static string output_path = "../../../../TestFiles/Output/";
32
33
34 /// <summary>
35 /// The following sample illustrates how to extract tables from PDF documents.
36 /// </summary>
37 static void TestTabularData()
38 {
39 // Test if the add-on is installed
40 if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular))
41 {
42 Console.WriteLine();
43 Console.WriteLine("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.");
44 Console.WriteLine("---------------------------------------------------------------");
45 Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
46 Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
47 Console.WriteLine("module, ensure that the SDK is able to find the required files");
48 Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
49 Console.WriteLine();
50 return;
51 }
52
53 try
54 {
55 // Extract tabular data as a JSON file
56 DataExtractionModule.ExtractData(input_path + "table.pdf", output_path + "table.json", DataExtractionModule.DataExtractionEngine.e_tabular);
57
58 // Extract tabular data as a JSON string
59 string json = DataExtractionModule.ExtractData(input_path + "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular);
60 System.IO.File.WriteAllText(output_path + "financial.json", json);
61
62 // Extract tabular data as an XLSX file
63 DataExtractionModule.ExtractToXLSX(input_path + "table.pdf", output_path + "table.xlsx");
64
65 // Extract tabular data as an XLSX stream (also known as filter)
66 MemoryFilter output_xlsx_stream = new MemoryFilter(0, false);
67 DataExtractionModule.ExtractToXLSX(input_path + "financial.pdf", output_xlsx_stream);
68 output_xlsx_stream.SetAsInputFilter();
69 output_xlsx_stream.WriteToFile(output_path + "financial.xlsx", false);
70 }
71 catch (PDFNetException e)
72 {
73 Console.WriteLine(e.Message);
74 }
75 }
76
77
78 /// <summary>
79 // The following sample illustrates how to extract document structure from PDF documents.
80 /// </summary>
81 static void TestDocumentStructure()
82 {
83 // Test if the add-on is installed
84 if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure))
85 {
86 Console.WriteLine();
87 Console.WriteLine("Unable to run Data Extraction: Apryse SDK Structured Output module not available.");
88 Console.WriteLine("---------------------------------------------------------------");
89 Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
90 Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
91 Console.WriteLine("module, ensure that the SDK is able to find the required files");
92 Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
93 Console.WriteLine();
94 return;
95 }
96
97 try
98 {
99 // Extract document structure as a JSON file
100 DataExtractionModule.ExtractData(input_path + "paragraphs_and_tables.pdf", output_path + "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure);
101
102 // Extract document structure as a JSON string
103 string json = DataExtractionModule.ExtractData(input_path + "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure);
104 System.IO.File.WriteAllText(output_path + "tagged.json", json);
105 }
106 catch (PDFNetException e)
107 {
108 Console.WriteLine(e.Message);
109 }
110 }
111
112
113 /// <summary>
114 // The following sample illustrates how to extract form fields from PDF documents.
115 /// </summary>
116 static void TestFormFields()
117 {
118 // Test if the add-on is installed
119 if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
120 {
121 Console.WriteLine();
122 Console.WriteLine("Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.");
123 Console.WriteLine("---------------------------------------------------------------");
124 Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
125 Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
126 Console.WriteLine("module, ensure that the SDK is able to find the required files");
127 Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
128 Console.WriteLine();
129 return;
130 }
131
132 try
133 {
134 // Extract form fields as a JSON file
135 DataExtractionModule.ExtractData(input_path + "formfields-scanned.pdf", output_path + "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form);
136
137 // Extract form fields as a JSON string
138 string json = DataExtractionModule.ExtractData(input_path + "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form);
139 System.IO.File.WriteAllText(output_path + "formfields.json", json);
140
141 // Detect and add form fields to a PDF document.
142 // PDF document already has form fields, and this sample will update to new found fields.
143 using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
144 {
145 DataExtractionModule.DetectAndAddFormFieldsToPDF(doc);
146 doc.Save(output_path + "formfields-scanned-new.pdf", SDFDoc.SaveOptions.e_linearized);
147 }
148
149 // Detect and add form fields to a PDF document.
150 // PDF document already has form fields, and this sample will keep the original fields.
151 using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
152 {
153 DataExtractionOptions options = new DataExtractionOptions();
154 options.SetOverlappingFormFieldBehavior("KeepOld");
155
156 DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options);
157 doc.Save(output_path + "formfields-scanned-old.pdf", SDFDoc.SaveOptions.e_linearized);
158 }
159 }
160 catch (PDFNetException e)
161 {
162 Console.WriteLine(e.Message);
163 }
164 }
165
166
167 /// <summary>
168 /// The main entry point for the application.
169 /// </summary>
170 static void Main(string[] args)
171 {
172 // The first step in every application using PDFNet is to initialize the
173 // library and set the path to common PDF resources. The library is usually
174 // initialized only once, but calling Initialize() multiple times is also fine.
175 PDFNet.Initialize(PDFTronLicense.Key);
176 PDFNet.AddResourceSearchPath("../../../../../Lib/");
177
178 TestTabularData();
179 TestDocumentStructure();
180 TestFormFields();
181
182 PDFNet.Terminate();
183 }
184 }
185}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/DataExtractionModule.h>
7#include <PDF/PDFNet.h>
8#include <PDF/PDFDoc.h>
9#include <PDF/Convert.h>
10#include <Filters/MemoryFilter.h>
11#include <string>
12#include <iostream>
13#include <fstream>
14#include "../../LicenseKey/CPP/LicenseKey.h"
15
16using namespace pdftron;
17using namespace PDF;
18using namespace Filters;
19using namespace std;
20
21//---------------------------------------------------------------------------------------
22// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
23// extract various types of data from PDF documents.
24//
25// The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
26//---------------------------------------------------------------------------------------
27
28void WriteTextToFile(const std::string& filename, const UString& text)
29{
30 ofstream out_file(filename.c_str(), ofstream::binary);
31 string out_buf = text.ConvertToUtf8();
32 out_file.write(out_buf.c_str(), out_buf.size());
33 out_file.close();
34}
35
36
// Relative paths to the folders holding the test inputs and the generated outputs.
std::string input_path = "../../TestFiles/";
std::string output_path = "../../TestFiles/Output/";
39
40//---------------------------------------------------------------------------------------
41// The following sample illustrates how to extract tables from PDF documents.
42//---------------------------------------------------------------------------------------
43void TestTabularData()
44{
45 // Test if the add-on is installed
46 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular))
47 {
48 cout << endl;
49 cout << "Unable to run Data Extraction: Apryse SDK Tabular Data module not available." << endl;
50 cout << "---------------------------------------------------------------" << endl;
51 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
52 cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
53 cout << "module, ensure that the SDK is able to find the required files" << endl;
54 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
55 return;
56 }
57
58 // Extract tabular data as a JSON file
59 DataExtractionModule::ExtractData(input_path + UString("table.pdf"), output_path + UString("table.json"), DataExtractionModule::e_Tabular);
60
61 // Extract tabular data as a JSON string
62 UString json = DataExtractionModule::ExtractData(input_path + UString("financial.pdf"), DataExtractionModule::e_Tabular);
63 WriteTextToFile((output_path + "financial.json").c_str(), json);
64
65 // Extract tabular data as an XLSX file
66 DataExtractionModule::ExtractToXLSX(input_path + UString("table.pdf"), output_path + UString("table.xlsx"));
67
68 // Extract tabular data as an XLSX stream (also known as filter)
69 MemoryFilter output_xlsx_stream(0, false);
70 DataExtractionOptions options;
71 options.SetPages("1"); // extract page 1
72 DataExtractionModule::ExtractToXLSX(input_path + UString("financial.pdf"), output_xlsx_stream, &options);
73 output_xlsx_stream.SetAsInputFilter();
74 output_xlsx_stream.WriteToFile(output_path + UString("financial.xlsx"), false);
75}
76
77//---------------------------------------------------------------------------------------
78// The following sample illustrates how to extract document structure from PDF documents.
79//---------------------------------------------------------------------------------------
80void TestDocumentStructure()
81{
82 // Test if the add-on is installed
83 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure))
84 {
85 cout << endl;
86 cout << "Unable to run Data Extraction: Apryse SDK Structured Output module not available." << endl;
87 cout << "---------------------------------------------------------------" << endl;
88 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
89 cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
90 cout << "module, ensure that the SDK is able to find the required files" << endl;
91 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
92 return;
93 }
94
95 // Extract document structure as a JSON file
96 DataExtractionModule::ExtractData(input_path + UString("paragraphs_and_tables.pdf"), output_path + UString("paragraphs_and_tables.json"), DataExtractionModule::e_DocStructure);
97
98 // Extract document structure as a JSON string
99 UString json = DataExtractionModule::ExtractData(input_path + UString("tagged.pdf"), DataExtractionModule::e_DocStructure);
100 WriteTextToFile((output_path + "tagged.json").c_str(), json);
101}
102
103//---------------------------------------------------------------------------------------
104// The following sample illustrates how to extract form fields from PDF documents.
105//---------------------------------------------------------------------------------------
106void TestFormFields()
107{
108 // Test if the add-on is installed
109 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form))
110 {
111 cout << endl;
112 cout << "Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available." << endl;
113 cout << "---------------------------------------------------------------" << endl;
114 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
115 cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
116 cout << "module, ensure that the SDK is able to find the required files" << endl;
117 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
118 return;
119 }
120
121 // Extract form fields as a JSON file
122 DataExtractionModule::ExtractData(input_path + UString("formfields-scanned.pdf"), output_path + UString("formfields-scanned.json"), DataExtractionModule::e_Form);
123
124 // Extract form fields as a JSON string
125 UString json = DataExtractionModule::ExtractData(input_path + UString("formfields.pdf"), DataExtractionModule::e_Form);
126 WriteTextToFile((output_path + "formfields.json").c_str(), json);
127
128 //---------------------------------------------------------------------------------------
129 // Detect and add form fields to a PDF document.
130 // PDF document already has form fields, and this sample will update to new found fields.
131 //---------------------------------------------------------------------------------------
132 {
133 PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
134
135 DataExtractionModule::DetectAndAddFormFieldsToPDF(doc);
136
137 // Save the modfied pdf document
138 doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDF::SDFDoc::e_linearized, NULL);
139 }
140
141 //---------------------------------------------------------------------------------------
142 // Detect and add form fields to a PDF document.
143 // PDF document already has form fields, and this sample will keep the original fields.
144 //---------------------------------------------------------------------------------------
145 {
146 PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
147
148 // Setup DataExtractionOptions to keep old fields
149 DataExtractionOptions options;
150 options.SetOverlappingFormFieldBehavior("KeepOld");
151
152 DataExtractionModule::DetectAndAddFormFieldsToPDF(doc, &options);
153
154 // Save the modfied pdf document
155 doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDF::SDFDoc::e_linearized, NULL);
156 }
157}
158
159int main(int argc, char* argv[])
160{
161 // The first step in every application using PDFNet is to initialize the
162 // library and set the path to common PDF resources. The library is usually
163 // initialized only once, but calling Initialize() multiple times is also fine.
164 PDFNet::Initialize(LicenseKey);
165
166 int ret = 0;
167
168 try
169 {
170 PDFNet::AddResourceSearchPath("../../../Lib/");
171
172 TestTabularData();
173 TestDocumentStructure();
174 TestFormFields();
175 }
176 catch (Common::Exception& e)
177 {
178 cout << e << endl;
179 ret = 1;
180 }
181 catch (...)
182 {
183 cout << "Unknown Exception" << endl;
184 ret = 1;
185 }
186
187 PDFNet::Terminate();
188
189 return ret;
190}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2023 by PDFTron Systems Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8 "fmt"
9 "os"
10 . "pdftron"
11)
12
13import "pdftron/Samples/LicenseKey/GO"
14
15//---------------------------------------------------------------------------------------
16// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
17// extract various types of data from PDF documents.
18//
19// The PDFTron SDK Data Extraction suite can be downloaded from
20// https://docs.apryse.com/core/info/modules/
21//
22// Please contact us if you have any questions.
23//---------------------------------------------------------------------------------------
24
25// Relative path to the folder containing the test files.
var (
	// inputPath is the folder the sample PDFs are read from.
	inputPath = "../../TestFiles/"
	// outputPath is the folder the extraction results are written to.
	outputPath = "../../TestFiles/Output/"
)
28
29//---------------------------------------------------------------------------------------
30
// catch recovers from a panic in the function that deferred it and stores the
// panic value, formatted with %v, in *err. When there is no panic, *err is left
// untouched. It must be invoked directly via defer for recover to take effect.
func catch(err *error) {
	recovered := recover()
	if recovered == nil {
		return
	}
	*err = fmt.Errorf("%v", recovered)
}
36
37//---------------------------------------------------------------------------------------
38
39func WriteTextToFile(outputFile string, text string) {
40 f, err := os.Create(outputFile)
41 if err != nil {
42 fmt.Println(err)
43 }
44
45 defer f.Close()
46
47 _, err2 := f.WriteString(text)
48 if err2 != nil {
49 fmt.Println(err2)
50 }
51}
52
53//---------------------------------------------------------------------------------------
54// The following sample illustrates how to extract tables from PDF documents.
55//---------------------------------------------------------------------------------------
56
57func TestTabularData() (err error) {
58 defer catch(&err)
59
60 // Test if the add-on is installed
61 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_Tabular) {
62 fmt.Println("")
63 fmt.Println("Unable to run Data Extraction: PDFTron SDK Tabular Data module not available.")
64 fmt.Println("-----------------------------------------------------------------------------")
65 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
66 fmt.Println("at https://docs.apryse.com/core/info/modules/. If you have already")
67 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
68 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
69 fmt.Println("")
70 return nil
71 }
72
73 // Extract tabular data as a JSON file
74 fmt.Println("Extract tabular data as a JSON file")
75
76 inputFile := inputPath + "table.pdf"
77 outputFile := outputPath + "table.json"
78 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_Tabular)
79
80 fmt.Println("Result saved in " + outputFile)
81
82 // Extract tabular data as a JSON string
83 fmt.Println("Extract tabular data as a JSON string")
84
85 inputFile = inputPath + "financial.pdf"
86 outputFile = outputPath + "financial.json"
87
88 json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_Tabular).(string)
89 WriteTextToFile(outputFile, json)
90
91 fmt.Println("Result saved in " + outputFile)
92
93 // Extract tabular data as an XLSX file
94 fmt.Println("Extract tabular data as an XLSX file")
95
96 inputFile = inputPath + "table.pdf"
97 outputFile = outputPath + "table.xlsx"
98 DataExtractionModuleExtractToXSLX(inputFile, outputFile)
99
100 fmt.Println("Result saved in " + outputFile)
101
102 // Extract tabular data as an XLSX stream (also known as filter)
103 fmt.Println("Extract tabular data as an XLSX stream")
104
105 inputFile = inputPath + "financial.pdf"
106 outputFile = outputPath + "financial.xlsx"
107 outputXlsxStream := NewMemoryFilter(0, false)
108 options := NewDataExtractionOptions()
109 options.SetPages("1"); // page 1
110 DataExtractionModuleExtractToXSLX(inputFile, outputXlsxStream, options)
111 outputXlsxStream.SetAsInputFilter()
112 outputXlsxStream.WriteToFile(outputFile, false)
113
114 fmt.Println("Result saved in " + outputFile)
115
116 return nil
117}
118
119//---------------------------------------------------------------------------------------
120// The following sample illustrates how to extract document structure from PDF documents.
121//---------------------------------------------------------------------------------------
122
123func TestDocumentStructure() (err error) {
124 defer catch(&err)
125
126 // Test if the add-on is installed
127 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_DocStructure) {
128 fmt.Println("")
129 fmt.Println("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
130 fmt.Println("-----------------------------------------------------------------------------")
131 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
132 fmt.Println("at https://docs.apryse.com/core/info/modules/. If you have already")
133 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
134 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
135 fmt.Println("")
136 return nil
137 }
138
139 // Extract document structure as a JSON file
140 fmt.Println("Extract document structure as a JSON file")
141
142 inputFile := inputPath + "paragraphs_and_tables.pdf"
143 outputFile := outputPath + "paragraphs_and_tables.json"
144 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_DocStructure)
145
146 fmt.Println("Result saved in " + outputFile)
147
148 // Extract document structure as a JSON string
149 fmt.Println("Extract document structure as a JSON string")
150
151 inputFile = inputPath + "tagged.pdf"
152 outputFile = outputPath + "tagged.json"
153 json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_DocStructure).(string)
154 WriteTextToFile(outputFile, json)
155
156 fmt.Println("Result saved in " + outputFile)
157
158 return nil
159}
160
161//---------------------------------------------------------------------------------------
162// The following sample illustrates how to extract form fields from PDF documents.
163//---------------------------------------------------------------------------------------
164
165func TestFormFields() (err error) {
166 defer catch(&err)
167
168 // Test if the add-on is installed
169 if !DataExtractionModuleIsModuleAvailable(DataExtractionModuleE_Form) {
170 fmt.Println("")
171 fmt.Println("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.")
172 fmt.Println("-----------------------------------------------------------------------------")
173 fmt.Println("The Data Extraction suite is an optional add-on, available for download")
174 fmt.Println("at https://docs.apryse.com/core/info/modules/. If you have already")
175 fmt.Println("downloaded this module, ensure that the SDK is able to find the required files")
176 fmt.Println("using the PDFNetAddResourceSearchPath() function.")
177 fmt.Println("")
178 return nil
179 }
180
181 // Extract form fields as a JSON file
182 fmt.Println("Extract form fields as a JSON file")
183
184 inputFile := inputPath + "formfields-scanned.pdf"
185 outputFile := outputPath + "formfields-scanned.json"
186 DataExtractionModuleExtractData(inputFile, outputFile, DataExtractionModuleE_Form)
187
188 fmt.Println("Result saved in " + outputFile)
189
190 // Extract form fields as a JSON string
191 fmt.Println("Extract form fields as a JSON string")
192
193 inputFile = inputPath + "formfields.pdf"
194 outputFile = outputPath + "formfields.json"
195
196 json := DataExtractionModuleExtractData(inputFile, DataExtractionModuleE_Form).(string)
197 WriteTextToFile(outputFile, json)
198
199 fmt.Println("Result saved in " + outputFile)
200
201 return nil
202}
203
204//---------------------------------------------------------------------------------------
205
206func main() {
207 // The first step in every application using PDFNet is to initialize the
208 // library. The library is usually initialized only once, but calling
209 // Initialize() multiple times is also fine.
210 PDFNetInitialize(PDFTronLicense.Key)
211
212 //-----------------------------------------------------------------------------------
213
214 PDFNetAddResourceSearchPath("../../../PDFNetC/Lib/")
215
216 //-----------------------------------------------------------------------------------
217
218 err := TestTabularData()
219 if err != nil {
220 fmt.Println(fmt.Errorf("Unable to extract tabular data, error: %s", err))
221 }
222
223 //-----------------------------------------------------------------------------------
224
225 err = TestDocumentStructure()
226 if err != nil {
227 fmt.Println(fmt.Errorf("Unable to extract document structure data, error: %s", err))
228 }
229
230 //-----------------------------------------------------------------------------------
231
232 err = TestFormFields()
233 if err != nil {
234 fmt.Println(fmt.Errorf("Unable to extract form fields data, error: %s", err))
235 }
236
237 //-----------------------------------------------------------------------------------
238
239 PDFNetTerminate()
240 fmt.Println("Done.")
241}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import java.io.FileWriter;
7import java.io.BufferedWriter;
8import java.io.FileNotFoundException;
9import java.io.IOException;
10
11import com.pdftron.common.PDFNetException;
12import com.pdftron.pdf.*;
13import com.pdftron.filters.*;
14import com.pdftron.sdf.SDFDoc;
15
16//---------------------------------------------------------------------------------------
17// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
18// extract various types of data from PDF documents.
19//
20// The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
21//---------------------------------------------------------------------------------------
22
23public class DataExtractionTest {
24
25 static void writeTextToFile(String filename, String text) throws IOException
26 {
27 BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
28 writer.write(text);
29 writer.close();
30 }
31
32 //---------------------------------------------------------------------------------------
33 // The following sample illustrates how to extract tables from PDF documents.
34 //---------------------------------------------------------------------------------------
35 static void testTabularData()
36 {
37 try {
38 // Test if the add-on is installed
39 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular))
40 {
41 System.out.println();
42 System.out.println("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.");
43 System.out.println("---------------------------------------------------------------");
44 System.out.println("The Data Extraction suite is an optional add-on, available for download");
45 System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
46 System.out.println("module, ensure that the SDK is able to find the required files");
47 System.out.println("using the PDFNet.addResourceSearchPath() function." );
48 System.out.println();
49 return;
50 }
51 } catch (PDFNetException e) {
52 System.out.println("Data Extraction module not available, error:");
53 e.printStackTrace();
54 System.out.println(e);
55 }
56
57 // Relative path to the folder containing test files.
58 String input_path = "../../TestFiles/";
59 String output_path = "../../TestFiles/Output/";
60
61 try {
62 // Extract tabular data as a JSON file
63 DataExtractionModule.extractData(input_path + "table.pdf", output_path + "table.json", DataExtractionModule.DataExtractionEngine.e_tabular);
64
65 // Extract tabular data as a JSON string
66 String json = DataExtractionModule.extractData(input_path + "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular);
67 writeTextToFile(output_path + "financial.json", json);
68
69 // Extract tabular data as an XLSX file
70 DataExtractionModule.extractToXLSX(input_path + "table.pdf", output_path + "table.xlsx");
71
72 // Extract tabular data as an XLSX stream (also known as filter)
73 DataExtractionOptions options = new DataExtractionOptions();
74 options.setPages("1");
75 MemoryFilter output_xlsx_stream = new MemoryFilter(0, false);
76 DataExtractionModule.extractToXLSX(input_path + "financial.pdf", output_xlsx_stream, options);
77 output_xlsx_stream.setAsInputFilter();
78 output_xlsx_stream.writeToFile(output_path + "financial.xlsx", false);
79
80 } catch (PDFNetException e) {
81 System.out.println(e);
82 }
83 catch (IOException e) {
84 System.out.println(e);
85 }
86 }
87
88 //---------------------------------------------------------------------------------------
89 // The following sample illustrates how to extract document structure from PDF documents.
90 //---------------------------------------------------------------------------------------
91 static void testDocumentStructure()
92 {
93 // Test if the add-on is installed
94 try {
95 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure))
96 {
97 System.out.println();
98 System.out.println("Unable to run Data Extraction: Apryse SDK Structured Output module not available.");
99 System.out.println("---------------------------------------------------------------");
100 System.out.println("The Data Extraction suite is an optional add-on, available for download");
101 System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
102 System.out.println("module, ensure that the SDK is able to find the required files");
103 System.out.println("using the PDFNet.addResourceSearchPath() function." );
104 System.out.println();
105 return;
106 }
107 } catch (PDFNetException e) {
108 System.out.println("Data Extraction module not available, error:");
109 e.printStackTrace();
110 System.out.println(e);
111 }
112
113 // Relative path to the folder containing test files.
114 String input_path = "../../TestFiles/";
115 String output_path = "../../TestFiles/Output/";
116
117 try {
118 // Extract document structure as a JSON file
119 DataExtractionModule.extractData(input_path + "paragraphs_and_tables.pdf", output_path + "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure);
120
121 // Extract document structure as a JSON string
122 String json = DataExtractionModule.extractData(input_path + "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure);
123 writeTextToFile(output_path + "tagged.json", json);
124
125 } catch (PDFNetException e) {
126 System.out.println(e);
127 }
128 catch (IOException e) {
129 System.out.println(e);
130 }
131 }
132
133 //---------------------------------------------------------------------------------------
134 // The following sample illustrates how to extract form fields from PDF documents.
135 //---------------------------------------------------------------------------------------
136 static void testFormFields()
137 {
138 try {
139 // Test if the add-on is installed
140 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
141 {
142 System.out.println();
143 System.out.println("Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.");
144 System.out.println("---------------------------------------------------------------");
145 System.out.println("The Data Extraction suite is an optional add-on, available for download");
146 System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
147 System.out.println("module, ensure that the SDK is able to find the required files");
148 System.out.println("using the PDFNet.addResourceSearchPath() function." );
149 System.out.println();
150 return;
151 }
152 } catch (PDFNetException e) {
153 System.out.println("Data Extraction module not available, error:");
154 e.printStackTrace();
155 System.out.println(e);
156 }
157
158 // Relative path to the folder containing test files.
159 String input_path = "../../TestFiles/";
160 String output_path = "../../TestFiles/Output/";
161
162 try {
163 // Extract form fields as a JSON file
164 DataExtractionModule.extractData(input_path + "formfields-scanned.pdf", output_path + "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form);
165
166 // Extract form fields as a JSON string
167 String json = DataExtractionModule.extractData(input_path + "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form);
168 writeTextToFile(output_path + "formfields.json", json);
169
170 //---------------------------------------------------------------------------------------
171 // Detect and add form fields to a PDF document.
172 // PDF document already has form fields, and this sample will update to new found fields.
173 //---------------------------------------------------------------------------------------
174 try (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
175 {
176 DataExtractionModule.detectAndAddFormFieldsToPDF(doc);
177
178 // Save the modfied pdf document
179 doc.save(output_path + "formfields-scanned-fields-new.pdf", SDFDoc.SaveMode.LINEARIZED, null);
180 } catch (Exception e) {
181 e.printStackTrace();
182 }
183
184 //---------------------------------------------------------------------------------------
185 // Detect and add form fields to a PDF document.
186 // PDF document already has form fields, and this sample will keep the original fields.
187 //---------------------------------------------------------------------------------------
188 try (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
189 {
190 // Setup DataExtractionOptions to keep old fields
191 DataExtractionOptions options = new DataExtractionOptions();
192 options.setOverlappingFormFieldBehavior("KeepOld");
193
194 DataExtractionModule.detectAndAddFormFieldsToPDF(doc, options);
195
196 // Save the modfied pdf document
197 doc.save(output_path + "formfields-scanned-fields-old.pdf", SDFDoc.SaveMode.LINEARIZED, null);
198 } catch (Exception e) {
199 e.printStackTrace();
200 }
201
202 } catch (PDFNetException e) {
203 System.out.println(e);
204 }
205 catch (IOException e) {
206 System.out.println(e);
207 }
208 }
209
210 public static void main(String[] args)
211 {
212 // The first step in every application using PDFNet is to initialize the
213 // library and set the path to common PDF resources. The library is usually
214 // initialized only once, but calling initialize() multiple times is also fine.
215 PDFNet.initialize(PDFTronLicense.Key());
216 PDFNet.addResourceSearchPath("../../../Lib/");
217
218 testTabularData();
219 testDocumentStructure();
220 testFormFields();
221
222 PDFNet.terminate();
223 }
224}
1<?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
10//---------------------------------------------------------------------------------------
11// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
12// extract various types of data from PDF documents.
13//
14// The PDFTron SDK Data Extraction suite can be downloaded from
15// https://docs.apryse.com/core/info/modules/
16//
17// Please contact us if you have any questions.
18//---------------------------------------------------------------------------------------
19
/**
 * Writes $text to $outputFile, creating the file or replacing its content.
 *
 * @param string $outputFile Path of the file to create or overwrite.
 * @param string $text       Content to write.
 * @throws Exception If the file cannot be opened or the write fails.
 */
function WriteTextToFile($outputFile, $text)
{
    // fopen() returns false on failure. Passing false on to fwrite()/fclose()
    // raises a TypeError on PHP 8, which is an Error rather than an Exception
    // and would bypass the callers' catch(Exception) handlers. The warning is
    // suppressed because the failure is reported through the exception instead.
    $outfile = @fopen($outputFile, "w");
    if ($outfile === false) {
        throw new Exception("Unable to open file for writing: " . $outputFile);
    }

    try {
        if (fwrite($outfile, $text) === false) {
            throw new Exception("Unable to write to file: " . $outputFile);
        }
    } finally {
        // Always release the handle, even when the write failed.
        fclose($outfile);
    }
}
26
/**
 * Prints the notice shown when an optional Data Extraction module is missing.
 *
 * @param string $headline First line of the notice (names the missing module),
 *                         including its trailing newline.
 */
function ReportMissingModule($headline)
{
    echo nl2br("\n");
    echo nl2br($headline);
    echo nl2br("-----------------------------------------------------------------------------\n");
    echo nl2br("The Data Extraction suite is an optional add-on, available for download\n");
    echo nl2br("at https://docs.apryse.com/core/info/modules/. If you have already\n");
    echo nl2br("downloaded this module, ensure that the SDK is able to find the required files\n");
    echo nl2br("using the PDFNet::AddResourceSearchPath() function.\n");
    echo nl2br("\n");
}

/**
 * Runs the three Data Extraction samples in sequence: tabular data,
 * document structure and form fields. Each sample is skipped with a notice
 * when its module is not available, and reports its own errors.
 */
function main()
{
    global $LicenseKey;

    // Test files live relative to the current working directory.
    $inputPath = getcwd()."/../../TestFiles/";
    $outputPath = $inputPath."Output/";

    // PDFNet must be initialized before any other call; initializing more
    // than once is harmless.
    PDFNet::Initialize($LicenseKey);
    // Wait for fonts to be loaded if they haven't already. PHP can run into
    // errors when shutting down if font loading is still in progress.
    PDFNet::GetSystemFontList();

    PDFNet::AddResourceSearchPath("../../../PDFNetC/Lib/");

    //----------------------------------------------------------------------
    // Sample 1: extract tables from PDF documents.
    //----------------------------------------------------------------------
    if (DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular)) {
        try {
            // Tabular data -> JSON file
            echo nl2br("Extract tabular data as a JSON file\n");
            $target = $outputPath."table.json";
            DataExtractionModule::ExtractData($inputPath."table.pdf", $target, DataExtractionModule::e_Tabular);
            echo nl2br("Result saved in " . $target . "\n");

            // Tabular data -> JSON string, written to disk by hand
            echo nl2br("Extract tabular data as a JSON string\n");
            $target = $outputPath."financial.json";
            $payload = DataExtractionModule::ExtractData($inputPath."financial.pdf", DataExtractionModule::e_Tabular);
            WriteTextToFile($target, $payload);
            echo nl2br("Result saved in " . $target . "\n");

            // Tabular data -> XLSX file
            echo nl2br("Extract tabular data as an XLSX file\n");
            $target = $outputPath."table.xlsx";
            DataExtractionModule::ExtractToXLSX($inputPath."table.pdf", $target);
            echo nl2br("Result saved in " . $target . "\n");

            // Tabular data -> XLSX stream (also known as filter)
            echo nl2br("Extract tabular data as an XLSX stream\n");
            $target = $outputPath."financial.xlsx";
            $xlsxStream = new MemoryFilter(0, false);
            $opts = new DataExtractionOptions();
            $opts->SetPages("1"); // page 1
            DataExtractionModule::ExtractToXLSX($inputPath."financial.pdf", $xlsxStream, $opts);
            $xlsxStream->SetAsInputFilter();
            $xlsxStream->WriteToFile($target, false);
            echo nl2br("Result saved in " . $target . "\n");
        }
        catch (Exception $e) {
            echo nl2br("Unable to extract tabular data, error: " . $e->getMessage() . "\n");
        }
    }
    else {
        ReportMissingModule("Unable to run Data Extraction: PDFTron SDK Tabular Data module not available.\n");
    }

    //----------------------------------------------------------------------
    // Sample 2: extract document structure from PDF documents.
    //----------------------------------------------------------------------
    if (DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure)) {
        try {
            // Document structure -> JSON file
            echo nl2br("Extract document structure as a JSON file\n");
            $target = $outputPath."paragraphs_and_tables.json";
            DataExtractionModule::ExtractData($inputPath."paragraphs_and_tables.pdf", $target, DataExtractionModule::e_DocStructure);
            echo nl2br("Result saved in " . $target . "\n");

            // Document structure -> JSON string
            echo nl2br("Extract document structure as a JSON string\n");
            $target = $outputPath."tagged.json";
            $payload = DataExtractionModule::ExtractData($inputPath."tagged.pdf", DataExtractionModule::e_DocStructure);
            WriteTextToFile($target, $payload);
            echo nl2br("Result saved in " . $target . "\n");
        }
        catch (Exception $e) {
            echo nl2br("Unable to extract document structure data, error: " . $e->getMessage() . "\n");
        }
    }
    else {
        ReportMissingModule("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.\n");
    }

    //----------------------------------------------------------------------
    // Sample 3: extract form fields from PDF documents.
    //----------------------------------------------------------------------
    if (DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form)) {
        try {
            // Form fields -> JSON file
            echo nl2br("Extract form fields as a JSON file\n");
            $target = $outputPath."formfields-scanned.json";
            DataExtractionModule::ExtractData($inputPath."formfields-scanned.pdf", $target, DataExtractionModule::e_Form);
            echo nl2br("Result saved in " . $target . "\n");

            // Form fields -> JSON string
            echo nl2br("Extract form fields as a JSON string\n");
            $target = $outputPath."formfields.json";
            $payload = DataExtractionModule::ExtractData($inputPath."formfields.pdf", DataExtractionModule::e_Form);
            WriteTextToFile($target, $payload);
            echo nl2br("Result saved in " . $target . "\n");

            // Detect and add form fields to a PDF that already has fields;
            // with default options the newly found fields replace the old ones.
            echo nl2br("Extract form fields as a PDF file\n");
            $pdf = new PDFDoc($inputPath."formfields-scanned-withfields.pdf");
            DataExtractionModule::DetectAndAddFormFieldsToPDF($pdf);
            $pdf->Save($outputPath."formfields-scanned-fields-new.pdf", SDFDoc::e_linearized);
            $pdf->Close();
            echo nl2br("Result saved in " . $outputPath ."formfields-scanned-fields-new.pdf" . "\n");

            // Same detection, but configured to keep the original fields.
            echo nl2br("Extract form fields as a PDF file\n");
            $pdf = new PDFDoc($inputPath."formfields-scanned-withfields.pdf");
            $opts = new DataExtractionOptions();
            $opts->SetOverlappingFormFieldBehavior("KeepOld");
            DataExtractionModule::DetectAndAddFormFieldsToPDF($pdf, $opts);
            $pdf->Save($outputPath."formfields-scanned-fields-old.pdf", SDFDoc::e_linearized);
            $pdf->Close();
            echo nl2br("Result saved in " . $outputPath ."formfields-scanned-fields-old.pdf" . "\n");
        }
        catch (Exception $e) {
            echo nl2br("Unable to extract form fields data, error: " . $e->getMessage() . "\n");
        }
    }
    else {
        ReportMissingModule("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.\n");
    }

    //----------------------------------------------------------------------

    PDFNet::Terminate();
    echo nl2br("Done.\n");
}
219
220main();
221?>
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6//---------------------------------------------------------------------------------------
7// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
8// extract various types of data from PDF documents.
9//
10// The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
11//---------------------------------------------------------------------------------------
12
13const fs = require('fs');
14const { PDFNet } = require('@pdftron/pdfnet-node');
15const PDFTronLicense = require('../LicenseKey/LicenseKey');
16
17((exports) => {
18 'use strict';
19
20 exports.runDataExtractionTest = () => {
21
22 const main = async () => {
23
24 const inputPath = '../TestFiles/';
25 const outputPath = '../TestFiles/Output/';
26
27 //////////////////////////////////////////////////////////////////////////
28
29 await PDFNet.addResourceSearchPath('../../lib/');
30
31 //////////////////////////////////////////////////////////////////////////
32 // The following sample illustrates how to extract tables from PDF documents.
33 //////////////////////////////////////////////////////////////////////////
34
35 // Test if the add-on is installed
36 if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular)) {
37 console.log('\nUnable to run Data Extraction: Apryse SDK Tabular Data module not available.');
38 console.log('---------------------------------------------------------------');
39 console.log('The Data Extraction suite is an optional add-on, available for download');
40 console.log('at https://docs.apryse.com/core/info/modules/. If you have already');
41 console.log('downloaded this module, ensure that the SDK is able to find the required files');
42 console.log('using the PDFNet.addResourceSearchPath() function.\n');
43 }
44 else
45 {
46 try {
47 // Extract tabular data as a JSON file
48 console.log('Extract tabular data as a JSON file');
49
50 let outputFile = outputPath + 'table.json';
51 await PDFNet.DataExtractionModule.extractData(inputPath + 'table.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular);
52
53 console.log('Result saved in ' + outputFile);
54
55 ///////////////////////////////////////////////////////
56 // Extract tabular data as a JSON string
57 console.log('Extract tabular data as a JSON string');
58
59 outputFile = outputPath + 'financial.json';
60 const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'financial.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_Tabular);
61 fs.writeFileSync(outputFile, json);
62
63 console.log('Result saved in ' + outputFile);
64
65 ///////////////////////////////////////////////////////
66 // Extract tabular data as an XLSX file
67 console.log('Extract tabular data as an XLSX file');
68
69 outputFile = outputPath + 'table.xlsx';
70 await PDFNet.DataExtractionModule.extractToXLSX(inputPath + 'table.pdf', outputFile);
71
72 console.log('Result saved in ' + outputFile);
73
74 ///////////////////////////////////////////////////////
75 // Extract tabular data as an XLSX stream (also known as filter)
76 console.log('Extract tabular data as an XLSX stream');
77
78 outputFile = outputPath + 'financial.xlsx';
79 const outputXlsxStream = await PDFNet.Filter.createMemoryFilter(0, false);
80 const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
81 options.setPages("1"); // page 1
82 await PDFNet.DataExtractionModule.extractToXLSXWithFilter(inputPath + 'financial.pdf', outputXlsxStream, options);
83 outputXlsxStream.memoryFilterSetAsInputFilter();
84 outputXlsxStream.writeToFile(outputFile, false);
85
86 console.log('Result saved in ' + outputFile);
87 } catch (err) {
88 console.log(err);
89 }
90 }
91
92 //////////////////////////////////////////////////////////////////////////
93 // The following sample illustrates how to extract document structure from PDF documents.
94 //////////////////////////////////////////////////////////////////////////
95
96 // Test if the add-on is installed
97 if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure)) {
98 console.log('\nUnable to run Data Extraction: Apryse SDK Structured Output module not available.');
99 console.log('---------------------------------------------------------------');
100 console.log('The Data Extraction suite is an optional add-on, available for download');
101 console.log('at https://docs.apryse.com/core/info/modules/. If you have already');
102 console.log('downloaded this module, ensure that the SDK is able to find the required files');
103 console.log('using the PDFNet.addResourceSearchPath() function.\n');
104 }
105 else
106 {
107 try {
108 // Extract document structure as a JSON file
109 console.log('Extract document structure as a JSON file');
110
111 let outputFile = outputPath + 'paragraphs_and_tables.json';
112 await PDFNet.DataExtractionModule.extractData(inputPath + 'paragraphs_and_tables.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
113
114 console.log('Result saved in ' + outputFile);
115
116 ///////////////////////////////////////////////////////
117 // Extract document structure as a JSON string
118 console.log('Extract document structure as a JSON string');
119
120 outputFile = outputPath + 'tagged.json';
121 const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'tagged.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_DocStructure);
122 fs.writeFileSync(outputFile, json);
123
124 console.log('Result saved in ' + outputFile);
125 } catch (err) {
126 console.log(err);
127 }
128 }
129
130 //////////////////////////////////////////////////////////////////////////
131 // The following sample illustrates how to extract form fields from PDF documents.
132 //////////////////////////////////////////////////////////////////////////
133
134 // Test if the add-on is installed
135 if (!await PDFNet.DataExtractionModule.isModuleAvailable(PDFNet.DataExtractionModule.DataExtractionEngine.e_Form)) {
136 console.log('\nUnable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.');
137 console.log('---------------------------------------------------------------');
138 console.log('The Data Extraction suite is an optional add-on, available for download');
139 console.log('at https://docs.apryse.com/core/info/modules/. If you have already');
140 console.log('downloaded this module, ensure that the SDK is able to find the required files');
141 console.log('using the PDFNet.addResourceSearchPath() function.\n');
142 }
143 else
144 {
145 try {
146 // Extract form fields as a JSON file
147 console.log('Extract form fields as a JSON file');
148
149 let outputFile = outputPath + 'formfields-scanned.json';
150 await PDFNet.DataExtractionModule.extractData(inputPath + 'formfields-scanned.pdf', outputFile, PDFNet.DataExtractionModule.DataExtractionEngine.e_Form);
151
152 console.log('Result saved in ' + outputFile);
153
154 ///////////////////////////////////////////////////////
155 // Extract form fields as a JSON string
156 console.log('Extract form fields as a JSON string');
157
158 outputFile = outputPath + 'formfields.json';
159 const json = await PDFNet.DataExtractionModule.extractDataAsString(inputPath + 'formfields.pdf', PDFNet.DataExtractionModule.DataExtractionEngine.e_Form);
160 fs.writeFileSync(outputFile, json);
161
162 console.log('Result saved in ' + outputFile);
163
164 //////////////////////////////////////////////////////////////////////////
165 // Detect and add form fields to a PDF document.
166 // Document already has form fields, and this sample will update to new found fields.
167 {
168 console.log('Detect and add form fields in a PDF file, keep new fields');
169
170 const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + "formfields-scanned-withfields.pdf");
171
172 await PDFNet.DataExtractionModule.detectAndAddFormFieldsToPDF(doc);
173 outputFile = outputPath + 'formfields-scanned-fields-new.pdf';
174 await doc.save(outputFile, PDFNet.SDFDoc.SaveOptions.e_linearized);
175
176 console.log('Result saved in ' + outputFile);
177 }
178
179 //////////////////////////////////////////////////////////////////////////
180 // Detect and add form fields to a PDF document.
181 // Document already has form fields, and this sample will keep the original fields.
182 {
183 console.log('Detect and add form fields in a PDF file, keep old fields');
184
185 const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + "formfields-scanned-withfields.pdf");
186
187 const options = new PDFNet.DataExtractionModule.DataExtractionOptions();
188 options.setOverlappingFormFieldBehavior('KeepOld');
189
190 await PDFNet.DataExtractionModule.detectAndAddFormFieldsToPDF(doc, options);
191 outputFile = outputPath + 'formfields-scanned-fields-old.pdf';
192 await doc.save(outputFile, PDFNet.SDFDoc.SaveOptions.e_linearized);
193 }
194
195 console.log('Result saved in ' + outputFile);
196
197 } catch (err) {
198 console.log(err);
199 }
200 }
201
202 //////////////////////////////////////////////////////////////////////////
203
204 console.log('Done.');
205 };
206
207 PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function (error) {
208 console.log('Error: ' + JSON.stringify(error));
209 }).then(function () { return PDFNet.shutdown(); });
210 };
211 exports.runDataExtractionTest();
212})(exports);
213// eslint-disable-next-line spaced-comment
214//# sourceURL=DataExtractionTest.js
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11import platform
12
13sys.path.append("../../LicenseKey/PYTHON")
14from LicenseKey import *
15
16#---------------------------------------------------------------------------------------
17# The Data Extraction suite is an optional PDFNet add-on collection that can be used to
18# extract various types of data from PDF documents.
19#
20# The PDFTron SDK Data Extraction suite can be downloaded from
21# https://docs.apryse.com/core/info/modules/
22#
23# Please contact us if you have any questions.
24#---------------------------------------------------------------------------------------
25
26# Relative path to the folder containing the test files.
27inputPath = "../../TestFiles/"
28outputPath = "../../TestFiles/Output/"
29
def WriteTextToFile(outputFile, text):
    """Write ``text`` to ``outputFile``, replacing any existing content.

    The file is opened in text mode ("w") and is closed even if the
    write raises.
    """
    with open(outputFile, "w") as handle:
        handle.write(text)
37
def _ReportMissingModule(headline):
    """Print the notice shown when an optional Data Extraction module is missing.

    ``headline`` is the first line of the notice and names the missing module.
    """
    print("")
    print(headline)
    print("-----------------------------------------------------------------------------")
    print("The Data Extraction suite is an optional add-on, available for download")
    print("at https://docs.apryse.com/core/info/modules/. If you have already")
    print("downloaded this module, ensure that the SDK is able to find the required files")
    print("using the PDFNet.AddResourceSearchPath() function.")
    print("")


def main():
    """Run the tabular-data, document-structure and form-field samples in order.

    Each sample is skipped with a notice when its module is not available,
    and reports its own errors without aborting the remaining samples.
    """
    # PDFNet must be initialized before any other call; initializing more
    # than once is harmless.
    PDFNet.Initialize(LicenseKey)

    PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")

    #-----------------------------------------------------------------------------------
    # Sample 1: extract tables from PDF documents.
    #-----------------------------------------------------------------------------------
    if DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_Tabular):
        try:
            # Tabular data -> JSON file
            print("Extract tabular data as a JSON file")
            target = outputPath + "table.json"
            DataExtractionModule.ExtractData(inputPath + "table.pdf", target, DataExtractionModule.e_Tabular)
            print("Result saved in " + target)

            # Tabular data -> JSON string, written to disk by hand
            print("Extract tabular data as a JSON string")
            target = outputPath + "financial.json"
            payload = DataExtractionModule.ExtractData(inputPath + "financial.pdf", DataExtractionModule.e_Tabular)
            WriteTextToFile(target, payload)
            print("Result saved in " + target)

            # Tabular data -> XLSX file
            print("Extract tabular data as an XLSX file")
            target = outputPath + "table.xlsx"
            DataExtractionModule.ExtractToXLSX(inputPath + "table.pdf", target)
            print("Result saved in " + target)

            # Tabular data -> XLSX stream (also known as filter)
            print("Extract tabular data as an XLSX stream")
            target = outputPath + "financial.xlsx"
            opts = DataExtractionOptions()
            opts.SetPages("1")  # page 1
            xlsxStream = MemoryFilter(0, False)
            DataExtractionModule.ExtractToXLSX(inputPath + "financial.pdf", xlsxStream, opts)
            xlsxStream.SetAsInputFilter()
            xlsxStream.WriteToFile(target, False)
            print("Result saved in " + target)
        except Exception as e:
            print("Unable to extract tabular data, error: " + str(e))
    else:
        _ReportMissingModule("Unable to run Data Extraction: PDFTron SDK Tabular Data module not available.")

    #-----------------------------------------------------------------------------------
    # Sample 2: extract document structure from PDF documents.
    #-----------------------------------------------------------------------------------
    if DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_DocStructure):
        try:
            # Document structure -> JSON file
            print("Extract document structure as a JSON file")
            target = outputPath + "paragraphs_and_tables.json"
            DataExtractionModule.ExtractData(inputPath + "paragraphs_and_tables.pdf", target, DataExtractionModule.e_DocStructure)
            print("Result saved in " + target)

            # Document structure -> JSON string
            print("Extract document structure as a JSON string")
            target = outputPath + "tagged.json"
            payload = DataExtractionModule.ExtractData(inputPath + "tagged.pdf", DataExtractionModule.e_DocStructure)
            WriteTextToFile(target, payload)
            print("Result saved in " + target)
        except Exception as e:
            print("Unable to extract document structure data, error: " + str(e))
    else:
        _ReportMissingModule("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")

    #-----------------------------------------------------------------------------------
    # Sample 3: extract form fields from PDF documents.
    #-----------------------------------------------------------------------------------
    if DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_Form):
        try:
            # Form fields -> JSON file
            print("Extract form fields as a JSON file")
            target = outputPath + "formfields-scanned.json"
            DataExtractionModule.ExtractData(inputPath + "formfields-scanned.pdf", target, DataExtractionModule.e_Form)
            print("Result saved in " + target)

            # Form fields -> JSON string
            print("Extract form fields as a JSON string")
            target = outputPath + "formfields.json"
            payload = DataExtractionModule.ExtractData(inputPath + "formfields.pdf", DataExtractionModule.e_Form)
            WriteTextToFile(target, payload)
            print("Result saved in " + target)

            # Detect and add form fields to a PDF that already has fields;
            # with default options the newly found fields replace the old ones.
            print("Extract form fields as a pdf file, update to new")
            pdf = PDFDoc(inputPath + "formfields-scanned-withfields.pdf")
            DataExtractionModule.DetectAndAddFormFieldsToPDF(pdf)
            target = outputPath + "formfields-scanned-fields-new.pdf"
            pdf.Save(target, SDFDoc.e_linearized)
            pdf.Close()
            print("Result saved in " + target)

            # Same detection, but configured to keep the original fields.
            print("Extract form fields as a pdf file, keep original")
            pdf = PDFDoc(inputPath + "formfields-scanned-withfields.pdf")
            opts = DataExtractionOptions()
            opts.SetOverlappingFormFieldBehavior("KeepOld")
            DataExtractionModule.DetectAndAddFormFieldsToPDF(pdf, opts)
            target = outputPath + "formfields-scanned-fields-old.pdf"
            pdf.Save(target, SDFDoc.e_linearized)
            pdf.Close()
            print("Result saved in " + target)
        except Exception as e:
            print("Unable to extract form fields data, error: " + str(e))
    else:
        _ReportMissingModule("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.")

    #-----------------------------------------------------------------------------------

    PDFNet.Terminate()
    print("Done.")
214
215if __name__ == '__main__':
216 main()
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
12#---------------------------------------------------------------------------------------
13# The Data Extraction suite is an optional PDFNet add-on collection that can be used to
14# extract various types of data from PDF documents.
15#
16# The PDFTron SDK Data Extraction suite can be downloaded from
17# https://docs.apryse.com/core/info/modules/
18#
19# Please contact us if you have any questions.
20#---------------------------------------------------------------------------------------
21
22# Relative path to the folder containing the test files.
23$inputPath = "../../TestFiles/"
24$outputPath = "../../TestFiles/Output/"
25
# Prints the notice shown when an optional Data Extraction add-on cannot be
# found by the SDK.
#
# module_name - display name of the add-on, inserted into the first message line.
def print_module_unavailable(module_name)
  puts ""
  puts "Unable to run Data Extraction: PDFTron SDK " + module_name + " module not available."
  puts "-----------------------------------------------------------------------------"
  puts "The Data Extraction suite is an optional add-on, available for download"
  puts "at https://docs.apryse.com/core/info/modules/. If you have already"
  puts "downloaded this module, ensure that the SDK is able to find the required files"
  puts "using the PDFNet.AddResourceSearchPath() function."
  puts ""
end

# Runs the three Data Extraction samples in order: tabular data, document
# structure, and form fields. Each sample is skipped with a notice when its
# add-on module is not available, and any error raised inside a sample is
# reported on stdout without stopping the remaining samples.
#
# Reads the sample inputs from $inputPath and writes every result below
# $outputPath.
def main()
  # The first step in every application using PDFNet is to initialize the
  # library. The library is usually initialized only once, but calling
  # Initialize() multiple times is also fine.
  PDFNet.Initialize(PDFTronLicense.Key)

  PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")

  #-----------------------------------------------------------------------------------
  # The following sample illustrates how to extract tables from PDF documents.
  #-----------------------------------------------------------------------------------

  # Test if the add-on is installed
  if DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_Tabular)
    begin
      # Extract tabular data as a JSON file
      puts "Extract tabular data as a JSON file"

      outputFile = $outputPath + "table.json"
      DataExtractionModule.ExtractData($inputPath + "table.pdf", outputFile, DataExtractionModule::E_Tabular)

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract tabular data as a JSON string
      puts "Extract tabular data as a JSON string"

      outputFile = $outputPath + "financial.json"
      json = DataExtractionModule.ExtractData($inputPath + "financial.pdf", DataExtractionModule::E_Tabular)
      File.open(outputFile, 'w') { |file| file.write(json) }

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract tabular data as an XLSX file
      puts "Extract tabular data as an XLSX file"

      outputFile = $outputPath + "table.xlsx"
      DataExtractionModule.ExtractToXLSX($inputPath + "table.pdf", outputFile)

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract tabular data as an XLSX stream (also known as filter)
      puts "Extract tabular data as an XLSX stream"

      outputFile = $outputPath + "financial.xlsx"
      outputXlsxStream = MemoryFilter.new(0, false)
      options = DataExtractionOptions.new()
      options.SetPages("1") # page 1
      DataExtractionModule.ExtractToXLSX($inputPath + "financial.pdf", outputXlsxStream, options)
      # Switch the filter to input mode before its contents are written to disk.
      outputXlsxStream.SetAsInputFilter()
      outputXlsxStream.WriteToFile(outputFile, false)

      puts "Result saved in " + outputFile
    rescue => error
      puts "Unable to extract tabular data, error: " + error.message
    end
  else
    print_module_unavailable("Tabular Data")
  end

  #-----------------------------------------------------------------------------------
  # The following sample illustrates how to extract document structure from PDF documents.
  #-----------------------------------------------------------------------------------

  # Test if the add-on is installed
  if DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_DocStructure)
    begin
      # Extract document structure as a JSON file
      puts "Extract document structure as a JSON file"

      outputFile = $outputPath + "paragraphs_and_tables.json"
      DataExtractionModule.ExtractData($inputPath + "paragraphs_and_tables.pdf", outputFile, DataExtractionModule::E_DocStructure)

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract document structure as a JSON string
      puts "Extract document structure as a JSON string"

      outputFile = $outputPath + "tagged.json"
      json = DataExtractionModule.ExtractData($inputPath + "tagged.pdf", DataExtractionModule::E_DocStructure)
      File.open(outputFile, 'w') { |file| file.write(json) }

      puts "Result saved in " + outputFile
    rescue => error
      puts "Unable to extract document structure data, error: " + error.message
    end
  else
    print_module_unavailable("Structured Output")
  end

  #-----------------------------------------------------------------------------------
  # The following sample illustrates how to extract form fields from PDF documents.
  #-----------------------------------------------------------------------------------

  # Test if the add-on is installed
  if DataExtractionModule.IsModuleAvailable(DataExtractionModule::E_Form)
    begin
      # Extract form fields as a JSON file
      puts "Extract form fields as a JSON file"

      outputFile = $outputPath + "formfields-scanned.json"
      DataExtractionModule.ExtractData($inputPath + "formfields-scanned.pdf", outputFile, DataExtractionModule::E_Form)

      puts "Result saved in " + outputFile

      #------------------------------------------------------
      # Extract form fields as a JSON string
      puts "Extract form fields as a JSON string"

      outputFile = $outputPath + "formfields.json"
      json = DataExtractionModule.ExtractData($inputPath + "formfields.pdf", DataExtractionModule::E_Form)
      File.open(outputFile, 'w') { |file| file.write(json) }

      puts "Result saved in " + outputFile

      #-----------------------------------------------------------------------------------
      # Detect and add form fields to a PDF document.
      # PDF document already has form fields, and this sample will update to the new fields.
      puts "Extract form fields as a pdf file, update to new"

      outputFile = $outputPath + "formfields-scanned-fields-new.pdf"
      doc = PDFDoc.new($inputPath + "formfields-scanned-withfields.pdf")
      begin
        DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)
        doc.Save(outputFile, SDFDoc::E_linearized)
      ensure
        # Close the document even when detection or saving raises.
        doc.Close
      end

      puts "Result saved in " + outputFile

      #-----------------------------------------------------------------------------------
      # Detect and add form fields to a PDF document.
      # PDF document already has form fields, and this sample will keep the original fields.
      puts "Extract form fields as a pdf file, keep original"

      outputFile = $outputPath + "formfields-scanned-fields-old.pdf"
      doc = PDFDoc.new($inputPath + "formfields-scanned-withfields.pdf")
      begin
        options = DataExtractionOptions.new()
        options.SetOverlappingFormFieldBehavior("KeepOld")
        DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)
        doc.Save(outputFile, SDFDoc::E_linearized)
      ensure
        # Close the document even when detection or saving raises.
        doc.Close
      end

      puts "Result saved in " + outputFile
    rescue => error
      puts "Unable to extract form fields data, error: " + error.message
    end
  else
    print_module_unavailable("AIFormFieldExtractor")
  end

  #-----------------------------------------------------------------------------------

  PDFNet.Terminate
  puts "Done."
end
208
209main()
1'
2' Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3'
4
5Imports pdftron
6Imports pdftron.Common
7Imports pdftron.PDF
8Imports pdftron.Filters
9
10' The Data Extraction suite is an optional PDFNet add-on collection that can be used to
11' extract various types of data from PDF documents.
12' The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
13
Module DataExtractionTestVB
    ' Holds the PDFNetLoader instance obtained in the module constructor below.
    Dim pdfNetLoader As PDFNetLoader
    Sub New()
        pdfNetLoader = pdftron.PDFNetLoader.Instance()
    End Sub

    ' Relative paths to the folder holding the sample inputs and to the folder
    ' that receives the generated results.
    Dim input_path As String = "../../../../TestFiles/"
    Dim output_path As String = "../../../../TestFiles/Output/"

    ' Entry point: initializes PDFNet, registers the resource search path,
    ' runs the three extraction samples in order and terminates PDFNet.
    Sub Main()
        PDFNet.Initialize(PDFTronLicense.Key)
        PDFNet.AddResourceSearchPath("../../../../../Lib/")

        TestTabularData()
        TestDocumentStructure()
        TestFormFields()

        PDFNet.Terminate()
    End Sub


    ' Writes the notice shown when an optional Data Extraction add-on is missing.
    ' moduleName is the add-on's display name, inserted into the first message line.
    Private Sub ReportMissingModule(ByVal moduleName As String)
        Console.WriteLine()
        Console.WriteLine("Unable to run Data Extraction: Apryse SDK " & moduleName & " module not available.")
        Console.WriteLine("---------------------------------------------------------------")
        Console.WriteLine("The Data Extraction suite is an optional add-on, available for download")
        Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this")
        Console.WriteLine("module, ensure that the SDK is able to find the required files")
        Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.")
        Console.WriteLine()
    End Sub


    ' Extracts tables from the sample PDFs as a JSON file, a JSON string,
    ' an XLSX file and an XLSX stream. Returns early with a notice when the
    ' tabular add-on is not available; PDFNet errors are printed to the console.
    Sub TestTabularData()
        Dim engine As DataExtractionModule.DataExtractionEngine = DataExtractionModule.DataExtractionEngine.e_tabular

        ' Nothing to demonstrate without the add-on.
        If Not DataExtractionModule.IsModuleAvailable(engine) Then
            ReportMissingModule("Tabular Data")
            Return
        End If

        Try
            Dim tablePdf As String = input_path & "table.pdf"
            Dim financialPdf As String = input_path & "financial.pdf"

            ' JSON written directly to a file.
            DataExtractionModule.ExtractData(tablePdf, output_path & "table.json", engine)

            ' JSON returned as a string, then saved by this sample.
            Dim tablesAsJson As String = DataExtractionModule.ExtractData(financialPdf, engine)
            System.IO.File.WriteAllText(output_path & "financial.json", tablesAsJson)

            ' XLSX written directly to a file.
            DataExtractionModule.ExtractToXLSX(tablePdf, output_path & "table.xlsx")

            ' XLSX delivered through a stream (also known as filter), which is
            ' switched to input mode before being written to disk.
            Dim xlsxStream As New MemoryFilter(0, False)
            DataExtractionModule.ExtractToXLSX(financialPdf, xlsxStream)
            xlsxStream.SetAsInputFilter()
            xlsxStream.WriteToFile(output_path & "financial.xlsx", False)

        Catch ex As PDFNetException
            Console.WriteLine(ex.Message)
        End Try
    End Sub


    ' Extracts document structure from the sample PDFs as a JSON file and as a
    ' JSON string. Returns early with a notice when the structure add-on is not
    ' available; PDFNet errors are printed to the console.
    Sub TestDocumentStructure()
        Dim engine As DataExtractionModule.DataExtractionEngine = DataExtractionModule.DataExtractionEngine.e_doc_structure

        ' Nothing to demonstrate without the add-on.
        If Not DataExtractionModule.IsModuleAvailable(engine) Then
            ReportMissingModule("Structured Output")
            Return
        End If

        Try
            ' JSON written directly to a file.
            DataExtractionModule.ExtractData(input_path & "paragraphs_and_tables.pdf", output_path & "paragraphs_and_tables.json", engine)

            ' JSON returned as a string, then saved by this sample.
            Dim structureAsJson As String = DataExtractionModule.ExtractData(input_path & "tagged.pdf", engine)
            System.IO.File.WriteAllText(output_path & "tagged.json", structureAsJson)

        Catch ex As PDFNetException
            Console.WriteLine(ex.Message)
        End Try
    End Sub


    ' Extracts form fields from the sample PDFs as JSON, then detects form
    ' fields and adds them to a PDF twice: once with the default behavior and
    ' once with overlapping fields set to "KeepOld". Returns early with a notice
    ' when the form add-on is not available; PDFNet errors are printed to the console.
    Sub TestFormFields()
        Dim engine As DataExtractionModule.DataExtractionEngine = DataExtractionModule.DataExtractionEngine.e_form

        ' Nothing to demonstrate without the add-on.
        If Not DataExtractionModule.IsModuleAvailable(engine) Then
            ReportMissingModule("AIFormFieldExtractor")
            Return
        End If

        Try
            Dim pdfWithFields As String = input_path & "formfields-scanned-withfields.pdf"

            ' JSON written directly to a file.
            DataExtractionModule.ExtractData(input_path & "formfields-scanned.pdf", output_path & "formfields-scanned.json", engine)

            ' JSON returned as a string, then saved by this sample.
            Dim fieldsAsJson As String = DataExtractionModule.ExtractData(input_path & "formfields.pdf", engine)
            System.IO.File.WriteAllText(output_path & "formfields.json", fieldsAsJson)

            ' Detect and add form fields to a PDF document.
            ' PDF document already has form fields, and this sample will update to new found fields.
            Using pdf As New PDFDoc(pdfWithFields)
                DataExtractionModule.DetectAndAddFormFieldsToPDF(pdf)
                pdf.Save(output_path & "formfields-scanned-new.pdf", SDF.SDFDoc.SaveOptions.e_linearized)
            End Using

            ' Detect and add form fields to a PDF document.
            ' PDF document already has form fields, and this sample will keep the original fields.
            Using pdf As New PDFDoc(pdfWithFields)
                Dim keepOldOptions As New DataExtractionOptions()
                keepOldOptions.SetOverlappingFormFieldBehavior("KeepOld")
                DataExtractionModule.DetectAndAddFormFieldsToPDF(pdf, keepOldOptions)
                pdf.Save(output_path & "formfields-scanned-old.pdf", SDF.SDFDoc.SaveOptions.e_linearized)
            End Using

        Catch ex As PDFNetException
            Console.WriteLine(ex.Message)
        End Try

    End Sub

End Module
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales