Smart Data Extraction - PHP Sample Code

This sample code shows how to use the Apryse Data Extraction module to extract tabular data, document structure, form fields and key-value pairs from PDF documents. Sample code is provided in Python, C++, C# (.Net), Java, Node.js (JavaScript), PHP, Ruby and VB.

To run this sample, you will need to:

  1. Get started with the Server SDK in your language/framework
  2. Download the Data Extraction Module

Learn more about our Server SDK.

1 <?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
10//---------------------------------------------------------------------------------------
11// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
12// extract various types of data from PDF documents.
13//
14// The Apryse SDK Data Extraction suite can be downloaded from
15// https://docs.apryse.com/core/guides/info/modules
16//
17// Please contact us if you have any questions.
18//---------------------------------------------------------------------------------------
19
/**
 * Writes a string to a file, creating the file or truncating an existing one.
 *
 * @param string $outputFile Path of the file to write.
 * @param string $text       Content to write to the file.
 *
 * @throws RuntimeException If the file cannot be opened or the write fails.
 */
function WriteTextToFile($outputFile, $text)
{
    $outfile = fopen($outputFile, "w");
    if ($outfile === false) {
        // Fail with an Exception subclass so the callers' catch(Exception)
        // blocks can report the problem; passing false on to fwrite() would
        // raise a TypeError (an Error, not an Exception) on PHP 8.
        throw new RuntimeException("Unable to open file for writing: " . $outputFile);
    }

    try {
        if (fwrite($outfile, $text) === false) {
            throw new RuntimeException("Unable to write to file: " . $outputFile);
        }
    } finally {
        // Release the handle whether or not the write succeeded.
        fclose($outfile);
    }
}
26
/**
 * Runs the Data Extraction sample.
 *
 * Initializes PDFNet, then for each extraction engine (tabular, document
 * structure, form fields, generic key-value) checks whether the engine is
 * available. If it is, the matching extractions are run on the test files and
 * the results are written to the TestFiles/Output folder; otherwise a message
 * explaining how to obtain the module is printed. Exceptions thrown during
 * extraction are caught per section and reported, so a failure in one section
 * does not prevent the following sections from running.
 */
function main()
{
    // Relative path to the folder containing the test files.
    $inputPath = getcwd()."/../../TestFiles/";
    $outputPath = $inputPath."Output/";

    // The first step in every application using PDFNet is to initialize the
    // library. The library is usually initialized only once, but calling
    // Initialize() multiple times is also fine.
    global $LicenseKey;
    PDFNet::Initialize($LicenseKey);
    // Wait for fonts to be loaded if they haven't already. This is done because
    // PHP can run into errors when shutting down if font loading is still in progress.
    PDFNet::GetSystemFontList();

    //-----------------------------------------------------------------------------------

    PDFNet::AddResourceSearchPath("../../../PDFNetC/Lib/");

    //////////////////////////////////////////////////////////////////////////
    // The following sample illustrates how to extract tables from PDF documents.
    //////////////////////////////////////////////////////////////////////////

    // Test if the add-on is installed
    if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular)) {
        echo(nl2br("\n"));
        echo(nl2br("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.\n"));
        echo(nl2br("-----------------------------------------------------------------------------\n"));
        echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
        echo(nl2br("at https://docs.apryse.com/core/guides/info/modules. If you have already\n"));
        echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
        echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
        echo(nl2br("\n"));
    }
    else {
        try {
            // Extract tabular data as a JSON file
            echo(nl2br("Extract tabular data as a JSON file\n"));

            $outputFile = $outputPath."table.json";
            DataExtractionModule::ExtractData($inputPath."table.pdf", $outputFile, DataExtractionModule::e_Tabular);

            echo(nl2br("Result saved in " . $outputFile . "\n"));

            ///////////////////////////////////////////////////////
            // Extract tabular data as a JSON string
            echo(nl2br("Extract tabular data as a JSON string\n"));

            $outputFile = $outputPath."financial.json";
            $json = DataExtractionModule::ExtractData($inputPath."financial.pdf", DataExtractionModule::e_Tabular);
            WriteTextToFile($outputFile, $json);

            echo(nl2br("Result saved in " . $outputFile . "\n"));

            ///////////////////////////////////////////////////////
            // Extract tabular data as an XLSX file
            echo(nl2br("Extract tabular data as an XLSX file\n"));

            $outputFile = $outputPath."table.xlsx";
            DataExtractionModule::ExtractToXLSX($inputPath."table.pdf", $outputFile);

            echo(nl2br("Result saved in " . $outputFile . "\n"));

            ///////////////////////////////////////////////////////
            // Extract tabular data as an XLSX stream (also known as filter)
            echo(nl2br("Extract tabular data as an XLSX stream\n"));

            $outputFile = $outputPath."financial.xlsx";
            $outputXlsxStream = new MemoryFilter(0, false);
            $options = new DataExtractionOptions();
            $options->SetPages("1"); // page 1
            DataExtractionModule::ExtractToXLSX($inputPath."financial.pdf", $outputXlsxStream, $options);
            // The in-memory result is then persisted to disk.
            $outputXlsxStream->SetAsInputFilter();
            $outputXlsxStream->WriteToFile($outputFile, false);

            echo(nl2br("Result saved in " . $outputFile . "\n"));
        }
        catch(Exception $e) {
            echo(nl2br("Unable to extract tabular data, error: " . $e->getMessage() . "\n"));
        }
    }

    //////////////////////////////////////////////////////////////////////////
    // The following sample illustrates how to extract document structure from PDF documents.
    //////////////////////////////////////////////////////////////////////////

    // Test if the add-on is installed
    if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure)) {
        echo(nl2br("\n"));
        echo(nl2br("Unable to run Data Extraction: Apryse SDK Structured Output module not available.\n"));
        echo(nl2br("-----------------------------------------------------------------------------\n"));
        echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
        echo(nl2br("at https://docs.apryse.com/documentation/core/info/modules/. If you have already\n"));
        echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
        echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
        echo(nl2br("\n"));
    }
    else {
        try {
            // Extract document structure as a JSON file
            echo(nl2br("Extract document structure as a JSON file\n"));

            $outputFile = $outputPath."paragraphs_and_tables.json";
            DataExtractionModule::ExtractData($inputPath."paragraphs_and_tables.pdf", $outputFile, DataExtractionModule::e_DocStructure);

            echo(nl2br("Result saved in " . $outputFile . "\n"));

            ///////////////////////////////////////////////////////
            // Extract document structure as a JSON string
            echo(nl2br("Extract document structure as a JSON string\n"));

            $outputFile = $outputPath."tagged.json";
            $json = DataExtractionModule::ExtractData($inputPath."tagged.pdf", DataExtractionModule::e_DocStructure);
            WriteTextToFile($outputFile, $json);

            echo(nl2br("Result saved in " . $outputFile . "\n"));
        }
        catch(Exception $e) {
            echo(nl2br("Unable to extract document structure data, error: " . $e->getMessage() . "\n"));
        }
    }

    //////////////////////////////////////////////////////////////////////////
    // The following sample illustrates how to extract form fields from PDF documents.
    //////////////////////////////////////////////////////////////////////////

    // Test if the add-on is installed
    if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form)) {
        echo(nl2br("\n"));
        echo(nl2br("Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available.\n"));
        echo(nl2br("-----------------------------------------------------------------------------\n"));
        echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
        echo(nl2br("at https://docs.apryse.com/documentation/core/info/modules/. If you have already\n"));
        echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
        echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
        echo(nl2br("\n"));
    }
    else {
        try {
            // Extract form fields as a JSON file
            echo(nl2br("Extract form fields as a JSON file\n"));

            $outputFile = $outputPath."formfields-scanned.json";
            DataExtractionModule::ExtractData($inputPath."formfields-scanned.pdf", $outputFile, DataExtractionModule::e_Form);

            echo(nl2br("Result saved in " . $outputFile . "\n"));

            ///////////////////////////////////////////////////////
            // Extract form fields as a JSON string
            echo(nl2br("Extract form fields as a JSON string\n"));

            $outputFile = $outputPath."formfields.json";
            $json = DataExtractionModule::ExtractData($inputPath."formfields.pdf", DataExtractionModule::e_Form);
            WriteTextToFile($outputFile, $json);

            echo(nl2br("Result saved in " . $outputFile . "\n"));

            ///////////////////////////////////////////////////////
            // Detect and add form fields to a PDF document.
            // PDF document already has form fields, and this sample will update to new found fields.
            echo(nl2br("Extract form fields as a PDF file\n"));

            $doc = new PDFDoc($inputPath."formfields-scanned-withfields.pdf");
            DataExtractionModule::DetectAndAddFormFieldsToPDF($doc);
            $doc->Save($outputPath."formfields-scanned-fields-new.pdf", SDFDoc::e_linearized);
            $doc->Close();

            echo(nl2br("Result saved in " . $outputPath ."formfields-scanned-fields-new.pdf" . "\n"));

            ///////////////////////////////////////////////////////
            // Detect and add form fields to a PDF document.
            // PDF document already has form fields, and this sample will keep the original fields.
            echo(nl2br("Extract form fields as a PDF file\n"));

            $doc = new PDFDoc($inputPath."formfields-scanned-withfields.pdf");
            $options = new DataExtractionOptions();
            $options->SetOverlappingFormFieldBehavior("KeepOld");
            DataExtractionModule::DetectAndAddFormFieldsToPDF($doc, $options);
            $doc->Save($outputPath."formfields-scanned-fields-old.pdf", SDFDoc::e_linearized);
            $doc->Close();

            echo(nl2br("Result saved in " . $outputPath ."formfields-scanned-fields-old.pdf" . "\n"));
        }
        catch(Exception $e) {
            echo(nl2br("Unable to extract form fields data, error: " . $e->getMessage() . "\n"));
        }
    }

    //////////////////////////////////////////////////////////////////////////
    // The following sample illustrates how to extract key-value pairs from PDF documents.
    //////////////////////////////////////////////////////////////////////////

    // Test if the add-on is installed
    if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_GenericKeyValue)) {
        echo(nl2br("\n"));
        echo(nl2br("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.\n"));
        echo(nl2br("-----------------------------------------------------------------------------\n"));
        echo(nl2br("The Data Extraction suite is an optional add-on, available for download\n"));
        echo(nl2br("at https://docs.apryse.com/documentation/core/info/modules/. If you have already\n"));
        echo(nl2br("downloaded this module, ensure that the SDK is able to find the required files\n"));
        echo(nl2br("using the PDFNet::AddResourceSearchPath() function.\n"));
        echo(nl2br("\n"));
    }
    else {
        try {
            echo(nl2br("Extract key-value pairs from a PDF\n"));
            // Simple example: Extract Keys & Values as a JSON file
            $outputFile = $outputPath."newsletter_key_val.json";
            DataExtractionModule::ExtractData($inputPath."newsletter.pdf", $outputFile, DataExtractionModule::e_GenericKeyValue);

            echo(nl2br("Result saved in " . $outputFile . "\n"));

            // Example with customized options:
            // Extract Keys & Values from pages 2-4, excluding ads
            $options = new DataExtractionOptions();
            $options->SetPages("2-4");

            $p2ExclusionZones = new RectCollection();
            // Exclude the ad on page 2
            // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
            // Coordinates rotate with the page, if it has rotation applied.
            $p2ExclusionZones->AddRect(new Rect(166.0, 47.0, 562.0, 222.0));
            $options->AddExclusionZonesForPage($p2ExclusionZones, 2);

            $p4InclusionZones = new RectCollection();
            $p4ExclusionZones = new RectCollection();
            // Only include the article text for page 4, exclude ads and headings
            $p4InclusionZones->AddRect(new Rect(30.0, 432.0, 562.0, 684.0));
            $p4ExclusionZones->AddRect(new Rect(30.0, 657.0, 295.0, 684.0));
            $options->AddInclusionZonesForPage($p4InclusionZones, 4);
            $options->AddExclusionZonesForPage($p4ExclusionZones, 4);

            echo(nl2br("Extract Key-Value pairs from specific pages and zones as a JSON file\n"));
            $outputFile = $outputPath."newsletter_key_val_with_zones.json";
            DataExtractionModule::ExtractData($inputPath."newsletter.pdf", $outputFile, DataExtractionModule::e_GenericKeyValue, $options);

            echo(nl2br("Result saved in " . $outputFile . "\n"));
        }
        catch(Exception $e) {
            // Report the failure under the right section name (key-value, not document structure).
            echo(nl2br("Unable to extract key-value data, error: " . $e->getMessage() . "\n"));
        }
    }

    //-----------------------------------------------------------------------------------

    PDFNet::Terminate();
    echo(nl2br("Done.\n"));
}
275
276main();
277?>
278

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales