Intelligent Data Extraction

This sample code shows how to use the Apryse Data Extraction module to extract tabular data, document structure, form fields, and generic key-value pairs from PDF documents. Sample code is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, and VB.

Learn more about our Server SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------

using System;
using System.IO;

using pdftron;
using pdftron.Common;
using pdftron.Filters;
using pdftron.PDF;
using pdftron.SDF;

namespace DataExtractionTestCS
{
    /// <summary>
    /// Console sample for the Apryse SDK Data Extraction suite.
    ///
    /// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
    /// extract various types of data from PDF documents.
    ///
    /// The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/
    /// </summary>
    class Class1
    {
        // The explicit (empty) static constructor below keeps the class from being marked
        // beforefieldinit, so this initializer is guaranteed to run before Main executes.
        // NOTE(review): presumably PDFNetLoader.Instance() locates/loads the native PDFNet
        // library -- confirm against the SDK documentation.
        private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
        static Class1() { }

        // Relative path to the folder containing test files.
        private static readonly string input_path = "../../../../TestFiles/";

        // Relative path to the folder that receives the generated output files.
        private static readonly string output_path = "../../../../TestFiles/Output/";


        /// <summary>
        /// Checks whether the add-on for <paramref name="engine"/> is available and, when it is
        /// not, prints a notice explaining how to obtain and locate it.
        /// </summary>
        /// <param name="engine">The data extraction engine the caller is about to use.</param>
        /// <param name="moduleName">Module name inserted into the "not available" message.</param>
        /// <returns><c>true</c> when the engine is available; otherwise <c>false</c>.</returns>
        private static bool EnsureModuleAvailable(DataExtractionModule.DataExtractionEngine engine, string moduleName)
        {
            if (DataExtractionModule.IsModuleAvailable(engine))
            {
                return true;
            }

            Console.WriteLine();
            Console.WriteLine("Unable to run Data Extraction: Apryse SDK " + moduleName + " module not available.");
            Console.WriteLine("---------------------------------------------------------------");
            Console.WriteLine("The Data Extraction suite is an optional add-on, available for download");
            Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this");
            Console.WriteLine("module, ensure that the SDK is able to find the required files");
            Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.");
            Console.WriteLine();
            return false;
        }


        /// <summary>
        /// The following sample illustrates how to extract tables from PDF documents.
        /// </summary>
        /// <remarks>
        /// Only <see cref="PDFNetException"/> is caught (its message is printed); exceptions
        /// raised by <c>File.WriteAllText</c> propagate to the caller.
        /// </remarks>
        static void TestTabularData()
        {
            // Test if the add-on is installed
            if (!EnsureModuleAvailable(DataExtractionModule.DataExtractionEngine.e_tabular, "Tabular Data"))
            {
                return;
            }

            try
            {
                // Extract tabular data as a JSON file
                DataExtractionModule.ExtractData(input_path + "table.pdf", output_path + "table.json", DataExtractionModule.DataExtractionEngine.e_tabular);

                // Extract tabular data as a JSON string
                string json = DataExtractionModule.ExtractData(input_path + "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular);
                File.WriteAllText(output_path + "financial.json", json);

                // Extract tabular data as an XLSX file
                DataExtractionModule.ExtractToXLSX(input_path + "table.pdf", output_path + "table.xlsx");

                // Extract tabular data as an XLSX stream (also known as filter).
                // The filter is disposed deterministically once its contents have been written out.
                // NOTE(review): the arguments are presumably (initial buffer size, is_input) -- confirm
                // against the MemoryFilter API reference.
                using (MemoryFilter output_xlsx_stream = new MemoryFilter(0, false))
                {
                    DataExtractionModule.ExtractToXLSX(input_path + "financial.pdf", output_xlsx_stream);

                    // The filter is switched to an input filter before its contents are written to disk.
                    output_xlsx_stream.SetAsInputFilter();
                    output_xlsx_stream.WriteToFile(output_path + "financial.xlsx", false);
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }


        /// <summary>
        /// The following sample illustrates how to extract document structure from PDF documents.
        /// </summary>
        /// <remarks>
        /// Only <see cref="PDFNetException"/> is caught (its message is printed); exceptions
        /// raised by <c>File.WriteAllText</c> propagate to the caller.
        /// </remarks>
        static void TestDocumentStructure()
        {
            // Test if the add-on is installed
            if (!EnsureModuleAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure, "Structured Output"))
            {
                return;
            }

            try
            {
                // Extract document structure as a JSON file
                DataExtractionModule.ExtractData(input_path + "paragraphs_and_tables.pdf", output_path + "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure);

                // Extract document structure as a JSON string
                string json = DataExtractionModule.ExtractData(input_path + "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure);
                File.WriteAllText(output_path + "tagged.json", json);
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }


        /// <summary>
        /// The following sample illustrates how to extract form fields from PDF documents.
        /// </summary>
        /// <remarks>
        /// Only <see cref="PDFNetException"/> is caught (its message is printed); exceptions
        /// raised by <c>File.WriteAllText</c> propagate to the caller.
        /// </remarks>
        static void TestFormFields()
        {
            // Test if the add-on is installed
            if (!EnsureModuleAvailable(DataExtractionModule.DataExtractionEngine.e_form, "AIFormFieldExtractor"))
            {
                return;
            }

            try
            {
                // Extract form fields as a JSON file
                DataExtractionModule.ExtractData(input_path + "formfields-scanned.pdf", output_path + "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form);

                // Extract form fields as a JSON string
                string json = DataExtractionModule.ExtractData(input_path + "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form);
                File.WriteAllText(output_path + "formfields.json", json);

                // Detect and add form fields to a PDF document.
                // The input PDF already contains form fields; in this variant they are updated
                // to the newly detected fields.
                using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
                {
                    DataExtractionModule.DetectAndAddFormFieldsToPDF(doc);
                    doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDFDoc.SaveOptions.e_linearized);
                }

                // Detect and add form fields to a PDF document.
                // The input PDF already contains form fields; in this variant the original
                // fields are kept.
                using (PDFDoc doc = new PDFDoc(input_path + "formfields-scanned-withfields.pdf"))
                {
                    DataExtractionOptions options = new DataExtractionOptions();
                    options.SetOverlappingFormFieldBehavior("KeepOld");

                    DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options);
                    doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDFDoc.SaveOptions.e_linearized);
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }


        /// <summary>
        /// The following sample illustrates how to extract generic key-value pairs from PDF
        /// documents, first with default settings and then restricted to selected pages and
        /// page zones.
        /// </summary>
        /// <remarks>
        /// Only <see cref="PDFNetException"/> is caught (its message is printed).
        /// </remarks>
        static void TestGenericKeyValue()
        {
            // Test if the add-on is installed
            if (!EnsureModuleAvailable(DataExtractionModule.DataExtractionEngine.e_generic_key_value, "AIPageObjectExtractor"))
            {
                return;
            }

            try
            {
                // Simple example: Extract Keys & Values as a JSON file
                DataExtractionModule.ExtractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value);

                // Example with customized options:
                // Extract Keys & Values from pages 2-4, excluding ads
                DataExtractionOptions options = new DataExtractionOptions();
                options.SetPages("2-4");

                RectCollection p2ExclusionZones = new RectCollection();
                // Exclude the ad on page 2
                // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
                // Coordinates rotate with the page, if it has rotation applied.
                p2ExclusionZones.AddRect(166, 47, 562, 222);
                options.AddExclusionZonesForPage(p2ExclusionZones, 2);

                RectCollection p4InclusionZones = new RectCollection();
                RectCollection p4ExclusionZones = new RectCollection();
                // Only include the article text for page 4, exclude ads and headings
                p4InclusionZones.AddRect(30, 432, 562, 684);
                p4ExclusionZones.AddRect(30, 657, 295, 684);
                options.AddInclusionZonesForPage(p4InclusionZones, 4);
                options.AddExclusionZonesForPage(p4ExclusionZones, 4);

                DataExtractionModule.ExtractData(input_path + "newsletter.pdf", output_path + "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options);
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }


        /// <summary>
        /// The main entry point for the application. Initializes PDFNet, runs each data
        /// extraction sample in turn, and terminates PDFNet afterwards.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // The first step in every application using PDFNet is to initialize the
            // library and set the path to common PDF resources. The library is usually
            // initialized only once, but calling Initialize() multiple times is also fine.
            PDFNet.Initialize(PDFTronLicense.Key);
            PDFNet.AddResourceSearchPath("../../../../../Lib/");

            try
            {
                // Every sample writes into the output folder, so make sure it exists first.
                // CreateDirectory does nothing when the folder is already present.
                Directory.CreateDirectory(output_path);

                TestTabularData();
                TestDocumentStructure();
                TestFormFields();
                TestGenericKeyValue();
            }
            finally
            {
                // Terminate in a finally block so the call is not skipped when a sample
                // lets a non-PDFNet exception (for example an I/O failure) propagate.
                PDFNet.Terminate();
            }
        }
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales