Data Extraction

Sample code shows how to use the Apryse Data Extraction module to extract tabular data, document structure and form fields from PDF documents. Sample code provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6using System;
7
8using pdftron;
9using pdftron.Common;
10using pdftron.PDF;
11using pdftron.SDF;
12using pdftron.Filters;
13
14namespace DataExtractionTestCS
15{
16 /// <summary>
17 ///---------------------------------------------------------------------------------------
18 /// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
19 /// extract various types of data from PDF documents.
20 ///
21 /// The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
22 //---------------------------------------------------------------------------------------
23 /// </summary>
24 class Class1
25 {
26 private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
27 static Class1() { }
28
29 // Relative path to the folder containing test files.
30 static string input_path = "../../../../TestFiles/";
31 static string output_path = "../../../../TestFiles/Output/";
32
33
34 /// <summary>
35 /// The following sample illustrates how to extract tables from PDF documents.
36 /// </summary>
37 static void TestTabularData()
38 {
39 // Test if the add-on is installed
40 if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular))
41 {
42 Console.WriteLine();
43 Console.WriteLine("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.");
44 Console.WriteLine("---------------------------------------------------------------");
45 Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
46 Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
47 Console.WriteLine("module, ensure that the SDK is able to find the required files");
48 Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
49 Console.WriteLine();
50 return;
51 }
52
53 try
54 {
55 // Extract tabular data as a JSON file
56 DataExtractionModule.ExtractData(input_path + "table.pdf", output_path + "table.json", DataExtractionModule.DataExtractionEngine.e_tabular);
57
58 // Extract tabular data as a JSON string
59 string json = DataExtractionModule.ExtractData(input_path + "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular);
60 System.IO.File.WriteAllText(output_path + "financial.json", json);
61
62 // Extract tabular data as an XLSX file
63 DataExtractionModule.ExtractToXLSX(input_path + "table.pdf", output_path + "table.xlsx");
64
65 // Extract tabular data as an XLSX stream (also known as filter)
66 MemoryFilter output_xlsx_stream = new MemoryFilter(0, false);
67 DataExtractionModule.ExtractToXLSX(input_path + "financial.pdf", output_xlsx_stream);
68 output_xlsx_stream.SetAsInputFilter();
69 output_xlsx_stream.WriteToFile(output_path + "financial.xlsx", false);
70 }
71 catch (PDFNetException e)
72 {
73 Console.WriteLine(e.Message);
74 }
75 }
76
77
78 /// <summary>
79 // The following sample illustrates how to extract document structure from PDF documents.
80 /// </summary>
81 static void TestDocumentStructure()
82 {
83 // Test if the add-on is installed
84 if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure))
85 {
86 Console.WriteLine();
87 Console.WriteLine("Unable to run Data Extraction: Apryse SDK Structured Output module not available.");
88 Console.WriteLine("---------------------------------------------------------------");
89 Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
90 Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
91 Console.WriteLine("module, ensure that the SDK is able to find the required files");
92 Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
93 Console.WriteLine();
94 return;
95 }
96
97 try
98 {
99 // Extract document structure as a JSON file
100 DataExtractionModule.ExtractData(input_path + "paragraphs_and_tables.pdf", output_path + "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure);
101
102 // Extract document structure as a JSON string
103 string json = DataExtractionModule.ExtractData(input_path + "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure);
104 System.IO.File.WriteAllText(output_path + "tagged.json", json);
105 }
106 catch (PDFNetException e)
107 {
108 Console.WriteLine(e.Message);
109 }
110 }
111
112
113 /// <summary>
114 // The following sample illustrates how to extract form fields from PDF documents.
115 /// </summary>
116 static void TestFormFields()
117 {
118 // Test if the add-on is installed
119 if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form))
120 {
121 Console.WriteLine();
122 Console.WriteLine("Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.");
123 Console.WriteLine("---------------------------------------------------------------");
124 Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
125 Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
126 Console.WriteLine("module, ensure that the SDK is able to find the required files");
127 Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
128 Console.WriteLine();
129 return;
130 }
131
132 try
133 {
134 // Extract form fields as a JSON file
135 DataExtractionModule.ExtractData(input_path + "formfields-scanned.pdf", output_path + "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form);
136
137 // Extract form fields as a JSON string
138 string json = DataExtractionModule.ExtractData(input_path + "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form);
139 System.IO.File.WriteAllText(output_path + "formfields.json", json);
140
141 // Detect and add form fields to a PDF document.
142 // PDF document already has form fields, and this sample will update to new found fields.
143 using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
144 {
145 DataExtractionModule.DetectAndAddFormFieldsToPDF(doc);
146 doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDFDoc.SaveOptions.e_linearized);
147 }
148
149 // Detect and add form fields to a PDF document.
150 // PDF document already has form fields, and this sample will keep the original fields.
151 using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
152 {
153 DataExtractionOptions options = new DataExtractionOptions();
154 options.SetOverlappingFormFieldBehavior("KeepOld");
155
156 DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options);
157 doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDFDoc.SaveOptions.e_linearized);
158 }
159 }
160 catch (PDFNetException e)
161 {
162 Console.WriteLine(e.Message);
163 }
164 }
165
166 /// <summary>
167 // The following sample illustrates how to extract document structure from PDF documents.
168 /// </summary>
169 static void TestGenericKeyValue()
170 {
171 if (!DataExtractionModule.IsModuleAvailable(DataExtractionModule.DataExtractionEngine.e_generic_key_value))
172 {
173 Console.WriteLine();
174 Console.WriteLine("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.");
175 Console.WriteLine("---------------------------------------------------------------");
176 Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
177 Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
178 Console.WriteLine("module, ensure that the SDK is able to find the required files");
179 Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
180 Console.WriteLine();
181 return;
182 }
183
184 try
185 {
186 // Simple example: Extract Keys & Values as a JSON file
187 DataExtractionModule.ExtractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value);
188
189 // Example with customized options:
190 // Extract Keys & Values from pages 2-4, excluding ads
191 DataExtractionOptions options = new DataExtractionOptions();
192 options.SetPages("2-4");
193
194 RectCollection p2ExclusionZones = new RectCollection();
195 // Exclude the ad on page 2
196 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
197 // Coordinates rotate with the page, if it has rotation applied.
198 p2ExclusionZones.AddRect(166, 47, 562, 222);
199 options.AddExclusionZonesForPage(p2ExclusionZones, 2);
200
201 RectCollection p4InclusionZones = new RectCollection();
202 RectCollection p4ExclusionZones = new RectCollection();
203 // Only include the article text for page 4, exclude ads and headings
204 p4InclusionZones.AddRect(30, 432, 562, 684);
205 p4ExclusionZones.AddRect(30, 657, 295, 684);
206 options.AddInclusionZonesForPage(p4InclusionZones, 4);
207 options.AddExclusionZonesForPage(p4ExclusionZones, 4);
208
209 DataExtractionModule.ExtractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options);
210 }
211 catch (PDFNetException e)
212 {
213 Console.WriteLine(e.Message);
214 }
215 }
216
217
218
219 /// <summary>
220 /// The main entry point for the application.
221 /// </summary>
222 static void Main(string[] args)
223 {
224 // The first step in every application using PDFNet is to initialize the
225 // library and set the path to common PDF resources. The library is usually
226 // initialized only once, but calling Initialize() multiple times is also fine.
227 PDFNet.Initialize(PDFTronLicense.Key);
228 PDFNet.AddResourceSearchPath("../../../../../Lib/");
229
230 TestTabularData();
231 TestDocumentStructure();
232 TestFormFields();
233 TestGenericKeyValue();
234
235 PDFNet.Terminate();
236 }
237 }
238}
239

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales