Smart Data Extraction - Java Sample Code

This sample code shows how to use the Apryse Data Extraction module to extract tabular data, document structure, and form fields from PDF documents. Sample code is provided in Python, C++, C# (.NET), Java, Node.js (JavaScript), PHP, Ruby, and VB.

To run this sample, you will need to:

  1. Get started with Server SDK in your language/framework
  2. Download the Data Extraction Module

Learn more about our Server SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.filters.*;
import com.pdftron.sdf.SDFDoc;
//---------------------------------------------------------------------------------------
// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
// extract various types of data from PDF documents.
//
// The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
//---------------------------------------------------------------------------------------
23public class DataExtractionTest {
24
25 static void writeTextToFile(String filename, String text) throws IOException
26 {
27 BufferedWriter writer = new BufferedWriter(new FileWriter(filename));
28 writer.write(text);
29 writer.close();
30 }
31
32 //---------------------------------------------------------------------------------------
33 // The following sample illustrates how to extract tables from PDF documents.
34 //---------------------------------------------------------------------------------------
35 static void testTabularData()
36 {
37 try {
38 // Test if the add-on is installed
39 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular))
40 {
41 System.out.println();
42 System.out.println("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.");
43 System.out.println("---------------------------------------------------------------");
44 System.out.println("The Data Extraction suite is an optional add-on, available for download");
45 System.out.println("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this");
46 System.out.println("module, ensure that the SDK is able to find the required files");
47 System.out.println("using the PDFNet.addResourceSearchPath() function." );
48 System.out.println();
49 return;
50 }
51 } catch (PDFNetException e) {
52 System.out.println("Data Extraction module not available, error:");
53 e.printStackTrace();
54 System.out.println(e);
55 }
56
57 // Relative path to the folder containing test files.
58 String input_path = "../../TestFiles/";
59 String output_path = "../../TestFiles/Output/";
60
61 try {
62 // Extract tabular data as a JSON file
63 DataExtractionModule.extractData(input_path + "table.pdf", output_path + "table.json", DataExtractionModule.DataExtractionEngine.e_tabular);
64
65 // Extract tabular data as a JSON string
66 String json = DataExtractionModule.extractData(input_path + "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular);
67 writeTextToFile(output_path + "financial.json", json);
68
69 // Extract tabular data as an XLSX file
70 DataExtractionModule.extractToXLSX(input_path + "table.pdf", output_path + "table.xlsx");
71
72 // Extract tabular data as an XLSX stream (also known as filter)
73 DataExtractionOptions options = new DataExtractionOptions();
74 options.setPages("1");
75 MemoryFilter output_xlsx_stream = new MemoryFilter(0, false);
76 DataExtractionModule.extractToXLSX(input_path + "financial.pdf", output_xlsx_stream, options);
77 output_xlsx_stream.setAsInputFilter();
78 output_xlsx_stream.writeToFile(output_path + "financial.xlsx", false);
79
80 } catch (PDFNetException e) {
81 System.out.println(e);
82 }
83 catch (IOException e) {
84 System.out.println(e);
85 }
86 }
87
88 //---------------------------------------------------------------------------------------
89 // The following sample illustrates how to extract document structure from PDF documents.
90 //---------------------------------------------------------------------------------------
91 static void testDocumentStructure()
92 {
93 // Test if the add-on is installed
94 try {
95 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure))
96 {
97 System.out.println();
98 System.out.println("Unable to run Data Extraction: Apryse SDK Structured Output module not available.");
99 System.out.println("---------------------------------------------------------------");
100 System.out.println("The Data Extraction suite is an optional add-on, available for download");
101 System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
102 System.out.println("module, ensure that the SDK is able to find the required files");
103 System.out.println("using the PDFNet.addResourceSearchPath() function." );
104 System.out.println();
105 return;
106 }
107 } catch (PDFNetException e) {
108 System.out.println("Data Extraction module not available, error:");
109 e.printStackTrace();
110 System.out.println(e);
111 }
112
113 // Relative path to the folder containing test files.
114 String input_path = "../../TestFiles/";
115 String output_path = "../../TestFiles/Output/";
116
117 try {
118 // Extract document structure as a JSON file
119 DataExtractionModule.extractData(input_path + "paragraphs_and_tables.pdf", output_path + "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure);
120
121 // Extract document structure as a JSON string
122 String json = DataExtractionModule.extractData(input_path + "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure);
123 writeTextToFile(output_path + "tagged.json", json);
124
125 } catch (PDFNetException e) {
126 System.out.println(e);
127 }
128 catch (IOException e) {
129 System.out.println(e);
130 }
131 }
132
133 //---------------------------------------------------------------------------------------
134 // The following sample illustrates how to extract form fields from PDF documents.
135 //---------------------------------------------------------------------------------------
136 static void testFormFields()
137 {
138 try {
139 // Test if the add-on is installed
140 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
141 {
142 System.out.println();
143 System.out.println("Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.");
144 System.out.println("---------------------------------------------------------------");
145 System.out.println("The Data Extraction suite is an optional add-on, available for download");
146 System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
147 System.out.println("module, ensure that the SDK is able to find the required files");
148 System.out.println("using the PDFNet.addResourceSearchPath() function." );
149 System.out.println();
150 return;
151 }
152 } catch (PDFNetException e) {
153 System.out.println("Data Extraction module not available, error:");
154 e.printStackTrace();
155 System.out.println(e);
156 }
157
158 // Relative path to the folder containing test files.
159 String input_path = "../../TestFiles/";
160 String output_path = "../../TestFiles/Output/";
161
162 try {
163 // Extract form fields as a JSON file
164 DataExtractionModule.extractData(input_path + "formfields-scanned.pdf", output_path + "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form);
165
166 // Extract form fields as a JSON string
167 String json = DataExtractionModule.extractData(input_path + "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form);
168 writeTextToFile(output_path + "formfields.json", json);
169
170 //---------------------------------------------------------------------------------------
171 // Detect and add form fields to a PDF document.
172 // PDF document already has form fields, and this sample will update to new found fields.
173 //---------------------------------------------------------------------------------------
174 try (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
175 {
176 DataExtractionModule.detectAndAddFormFieldsToPDF(doc);
177
178 // Save the modfied pdf document
179 doc.save(output_path + "formfields-scanned-fields-new.pdf", SDFDoc.SaveMode.LINEARIZED, null);
180 } catch (Exception e) {
181 e.printStackTrace();
182 }
183
184 //---------------------------------------------------------------------------------------
185 // Detect and add form fields to a PDF document.
186 // PDF document already has form fields, and this sample will keep the original fields.
187 //---------------------------------------------------------------------------------------
188 try (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
189 {
190 // Setup DataExtractionOptions to keep old fields
191 DataExtractionOptions options = new DataExtractionOptions();
192 options.setOverlappingFormFieldBehavior("KeepOld");
193
194 DataExtractionModule.detectAndAddFormFieldsToPDF(doc, options);
195
196 // Save the modfied pdf document
197 doc.save(output_path + "formfields-scanned-fields-old.pdf", SDFDoc.SaveMode.LINEARIZED, null);
198 } catch (Exception e) {
199 e.printStackTrace();
200 }
201
202 } catch (PDFNetException e) {
203 System.out.println(e);
204 }
205 catch (IOException e) {
206 System.out.println(e);
207 }
208 }
209
210 //---------------------------------------------------------------------------------------
211 // The following sample illustrates how to extract key-value pairs from PDF documents.
212 //---------------------------------------------------------------------------------------
213 public static void testGenericKeyValue() {
214 try {
215 // Test if the add-on is installed
216 if (!DataExtractionModule.isModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
217 {
218 System.out.println();
219 System.out.println("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.");
220 System.out.println("---------------------------------------------------------------");
221 System.out.println("The Data Extraction suite is an optional add-on, available for download");
222 System.out.println("at http://www.pdftron.com/. If you have already downloaded this");
223 System.out.println("module, ensure that the SDK is able to find the required files");
224 System.out.println("using the PDFNet.addResourceSearchPath() function." );
225 System.out.println();
226 return;
227 }
228 } catch (PDFNetException e) {
229 System.out.println("Data Extraction module not available, error:");
230 e.printStackTrace();
231 System.out.println(e);
232 }
233
234 // Relative path to the folder containing test files.
235 String input_path = "../../TestFiles/";
236 String output_path = "../../TestFiles/Output/";
237
238 try {
239
240 // Simple example: Extract Keys & Values as a JSON file
241 DataExtractionModule.extractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value);
242
243 // Example with customized options:
244 // Extract Keys & Values from pages 2-4, excluding ads
245 DataExtractionOptions options = new DataExtractionOptions();
246 options.setPages("2-4");
247
248 RectCollection p2ExclusionZones = new RectCollection();
249 // Exclude the ad on page 2
250 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
251 // Coordinates rotate with the page, if it has rotation applied.
252 p2ExclusionZones.addRect(166, 47, 562, 222);
253 options.addExclusionZonesForPage(p2ExclusionZones, 2);
254
255 RectCollection p4InclusionZones = new RectCollection();
256 RectCollection p4ExclusionZones = new RectCollection();
257 // Only include the article text for page 4, exclude ads and headings
258 p4InclusionZones.addRect(30, 432, 562, 684);
259 p4ExclusionZones.addRect(30, 657, 295, 684);
260 options.addInclusionZonesForPage(p4InclusionZones, 4);
261 options.addExclusionZonesForPage(p4ExclusionZones, 4);
262
263 DataExtractionModule.extractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options);
264
265 } catch (Exception e) {
266 System.out.println(e);
267 }
268 }
269
270 public static void main(String[] args)
271 {
272 // The first step in every application using PDFNet is to initialize the
273 // library and set the path to common PDF resources. The library is usually
274 // initialized only once, but calling initialize() multiple times is also fine.
275 PDFNet.initialize(PDFTronLicense.Key());
276 PDFNet.addResourceSearchPath("../../../Lib/");
277
278 testTabularData();
279 testDocumentStructure();
280 testFormFields();
281 testGenericKeyValue();
282
283 PDFNet.terminate();
284 }
285}
286

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales