Smart Data Extraction - Python Sample Code

This sample code shows how to use the Apryse Data Extraction module to extract tabular data, document structure, form fields and key-value pairs from PDF documents. Sample code is provided in Python, C++, C# (.Net), Java, Node.js (JavaScript), PHP, Ruby and VB.

To run this sample, you will need to:

  1. Get started with Server SDK in your language/framework
  2. Download the Data Extraction Module

Learn more about our Server SDK.

#---------------------------------------------------------------------------------------
# Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
# Consult LICENSE.txt regarding license information.
#---------------------------------------------------------------------------------------
5
import site
# Add the PDFNetC library folder to the module search path so that the
# PDFNetPython import below can be resolved.
site.addsitedir("../../../PDFNetC/Lib")
import sys
from PDFNetPython import *

import platform

# The LicenseKey module lives in a shared folder outside this sample's directory;
# it supplies the LicenseKey value passed to PDFNet.Initialize() in main().
sys.path.append("../../LicenseKey/PYTHON")
from LicenseKey import *
15
#---------------------------------------------------------------------------------------
# The Data Extraction suite is an optional PDFNet add-on collection that can be used to
# extract various types of data from PDF documents.
#
# The Apryse SDK Data Extraction suite can be downloaded from
# https://docs.apryse.com/core/guides/info/modules#data-extraction-module
#
# Please contact us if you have any questions.
#---------------------------------------------------------------------------------------
25
# Relative paths to the folder containing the input test files and to the
# folder that receives the files produced by this sample.
inputPath = "../../TestFiles/"
outputPath = "../../TestFiles/Output/"
29
def WriteTextToFile(outputFile, text):
    """Write ``text`` to ``outputFile``, replacing any existing contents.

    Args:
        outputFile: Path of the file to create or overwrite.
        text: String to write (in this sample, the JSON returned by
            DataExtractionModule.ExtractData).

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Write as UTF-8 explicitly: the platform default encoding (for example
    # cp1252 on Windows) cannot represent every character that may appear in
    # extracted text, and JSON is expected to be UTF-8 encoded.
    with open(outputFile, "w", encoding="utf-8") as f:
        f.write(text)
37
def _ExtractTabularData():
    """Run the tabular-data samples: JSON file, JSON string, XLSX file and XLSX stream."""
    # Test if the add-on is installed
    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_Tabular):
        print("")
        print("Unable to run Data Extraction: Apryse SDK Tabular Data module not available.")
        print("-----------------------------------------------------------------------------")
        print("The Data Extraction suite is an optional add-on, available for download")
        print("at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already")
        print("downloaded this module, ensure that the SDK is able to find the required files")
        print("using the PDFNet.AddResourceSearchPath() function.")
        print("")
        return

    try:
        # Extract tabular data as a JSON file
        print("Extract tabular data as a JSON file")

        outputFile = outputPath + "table.json"
        DataExtractionModule.ExtractData(inputPath + "table.pdf", outputFile, DataExtractionModule.e_Tabular)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract tabular data as a JSON string
        print("Extract tabular data as a JSON string")

        outputFile = outputPath + "financial.json"
        # Named jsonText rather than json so the standard-library module name is not shadowed.
        jsonText = DataExtractionModule.ExtractData(inputPath + "financial.pdf", DataExtractionModule.e_Tabular)
        WriteTextToFile(outputFile, jsonText)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract tabular data as an XLSX file
        print("Extract tabular data as an XLSX file")

        outputFile = outputPath + "table.xlsx"
        DataExtractionModule.ExtractToXLSX(inputPath + "table.pdf", outputFile)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract tabular data as an XLSX stream (also known as filter)
        print("Extract tabular data as an XLSX stream")

        outputFile = outputPath + "financial.xlsx"
        options = DataExtractionOptions()
        options.SetPages("1") # page 1
        outputXlsxStream = MemoryFilter(0, False)
        DataExtractionModule.ExtractToXLSX(inputPath + "financial.pdf", outputXlsxStream, options)
        # Switch the in-memory filter to input mode before dumping it to disk.
        outputXlsxStream.SetAsInputFilter()
        outputXlsxStream.WriteToFile(outputFile, False)

        print("Result saved in " + outputFile)
    except Exception as e:
        print("Unable to extract tabular data, error: " + str(e))


def _ExtractDocStructure():
    """Run the document-structure samples: JSON file and JSON string."""
    # Test if the add-on is installed
    # NOTE(review): this notice uses the "PDFTron" name and a different URL than
    # the tabular notice; the text is kept unchanged here - confirm whether it
    # should be aligned.
    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_DocStructure):
        print("")
        print("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
        print("-----------------------------------------------------------------------------")
        print("The Data Extraction suite is an optional add-on, available for download")
        print("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
        print("downloaded this module, ensure that the SDK is able to find the required files")
        print("using the PDFNet.AddResourceSearchPath() function.")
        print("")
        return

    try:
        # Extract document structure as a JSON file
        print("Extract document structure as a JSON file")

        outputFile = outputPath + "paragraphs_and_tables.json"
        DataExtractionModule.ExtractData(inputPath + "paragraphs_and_tables.pdf", outputFile, DataExtractionModule.e_DocStructure)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract document structure as a JSON string
        print("Extract document structure as a JSON string")

        outputFile = outputPath + "tagged.json"
        jsonText = DataExtractionModule.ExtractData(inputPath + "tagged.pdf", DataExtractionModule.e_DocStructure)
        WriteTextToFile(outputFile, jsonText)

        print("Result saved in " + outputFile)
    except Exception as e:
        print("Unable to extract document structure data, error: " + str(e))


def _DetectAndSaveFormFields(outputFile, options=None):
    """Detect form fields in the scanned sample PDF and save the result to outputFile.

    Args:
        outputFile: Path of the linearized PDF to write.
        options: Optional DataExtractionOptions; when omitted, the detection call
            is made without an options argument.
    """
    doc = PDFDoc(inputPath + "formfields-scanned-withfields.pdf")
    try:
        if options is None:
            DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)
        else:
            DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)
        doc.Save(outputFile, SDFDoc.e_linearized)
    finally:
        # Close the document even when detection or saving raises, so it is not leaked.
        doc.Close()


def _ExtractFormFields():
    """Run the form-field samples: JSON file, JSON string, and two PDF-updating variants."""
    # Test if the add-on is installed
    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_Form):
        print("")
        print("Unable to run Data Extraction: PDFTron SDK AIFormFieldExtractor module not available.")
        print("-----------------------------------------------------------------------------")
        print("The Data Extraction suite is an optional add-on, available for download")
        print("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
        print("downloaded this module, ensure that the SDK is able to find the required files")
        print("using the PDFNet.AddResourceSearchPath() function.")
        print("")
        return

    try:
        # Extract form fields as a JSON file
        print("Extract form fields as a JSON file")

        outputFile = outputPath + "formfields-scanned.json"
        DataExtractionModule.ExtractData(inputPath + "formfields-scanned.pdf", outputFile, DataExtractionModule.e_Form)

        print("Result saved in " + outputFile)

        #------------------------------------------------------
        # Extract form fields as a JSON string
        print("Extract form fields as a JSON string")

        outputFile = outputPath + "formfields.json"
        jsonText = DataExtractionModule.ExtractData(inputPath + "formfields.pdf", DataExtractionModule.e_Form)
        WriteTextToFile(outputFile, jsonText)

        print("Result saved in " + outputFile)

        #-----------------------------------------------------------------------------------
        # Detect and add form fields to a PDF document.
        # PDF document already has form fields, and this sample will update to new found fields.
        print("Extract form fields as a pdf file, update to new")

        outputFile = outputPath + "formfields-scanned-fields-new.pdf"
        _DetectAndSaveFormFields(outputFile)

        print("Result saved in " + outputFile)

        #-----------------------------------------------------------------------------------
        # Detect and add form fields to a PDF document.
        # PDF document already has form fields, and this sample will keep the original fields.
        print("Extract form fields as a pdf file, keep original")

        options = DataExtractionOptions()
        options.SetOverlappingFormFieldBehavior("KeepOld")

        outputFile = outputPath + "formfields-scanned-fields-old.pdf"
        _DetectAndSaveFormFields(outputFile, options)

        print("Result saved in " + outputFile)

    except Exception as e:
        print("Unable to extract form fields data, error: " + str(e))


def _ExtractKeyValuePairs():
    """Run the key-value samples: whole document, then selected pages with zones."""
    if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_GenericKeyValue):
        print()
        print("Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available.")
        print("---------------------------------------------------------------")
        print("The Data Extraction suite is an optional add-on, available for download")
        print("at http://www.pdftron.com/. If you have already downloaded this")
        print("module, ensure that the SDK is able to find the required files")
        print("using the PDFNet.AddResourceSearchPath() function.")
        print()
        return

    try:
        print("Extract key-value pairs from a PDF")
        # Simple example: Extract Keys & Values as a JSON file
        DataExtractionModule.ExtractData(inputPath + "newsletter.pdf", outputPath + "newsletter_key_val.json", DataExtractionModule.e_GenericKeyValue)
        print("Result saved in " + outputPath + "newsletter_key_val.json")

        # Example with customized options:
        # Extract Keys & Values from pages 2-4, excluding ads
        options = DataExtractionOptions()
        options.SetPages("2-4")

        p2_exclusion_zones = RectCollection()
        # Exclude the ad on page 2
        # These coordinates are in PDF user space, with the origin at the bottom left corner of the page
        # Coordinates rotate with the page, if it has rotation applied.
        p2_exclusion_zones.AddRect(Rect(166, 47, 562, 222))
        options.AddExclusionZonesForPage(p2_exclusion_zones, 2)

        p4_inclusion_zones = RectCollection()
        p4_exclusion_zones = RectCollection()
        # Only include the article text for page 4, exclude ads and headings
        p4_inclusion_zones.AddRect(Rect(30, 432, 562, 684))
        p4_exclusion_zones.AddRect(Rect(30, 657, 295, 684))
        options.AddInclusionZonesForPage(p4_inclusion_zones, 4)
        options.AddExclusionZonesForPage(p4_exclusion_zones, 4)
        print("Extract Key-Value pairs from specific pages and zones as a JSON file")
        DataExtractionModule.ExtractData(inputPath + "newsletter.pdf", outputPath + "newsletter_key_val_with_zones.json", DataExtractionModule.e_GenericKeyValue, options)
        print("Result saved in " + outputPath + "newsletter_key_val_with_zones.json")
    except Exception as e:
        print("Unable to extract key-value data, error: " + str(e))


def main():
    """Run every Data Extraction sample in turn.

    Each sample first checks whether its add-on module is available; when it
    is not, a notice is printed and that sample is skipped. Errors raised
    inside a sample are reported on stdout and do not stop later samples.
    """
    # The first step in every application using PDFNet is to initialize the
    # library. The library is usually initialized only once, but calling
    # Initialize() multiple times is also fine.
    PDFNet.Initialize(LicenseKey)

    try:
        PDFNet.AddResourceSearchPath("../../../PDFNetC/Lib/")

        #-----------------------------------------------------------------------------------
        # The following sample illustrates how to extract tables from PDF documents.
        #-----------------------------------------------------------------------------------
        _ExtractTabularData()

        #-----------------------------------------------------------------------------------
        # The following sample illustrates how to extract document structure from PDF documents.
        #-----------------------------------------------------------------------------------
        _ExtractDocStructure()

        #-----------------------------------------------------------------------------------
        # The following sample illustrates how to extract form fields from PDF documents.
        #-----------------------------------------------------------------------------------
        _ExtractFormFields()

        #---------------------------------------------------------------------------------------
        # The following sample illustrates how to extract key-value pairs from PDF documents.
        #---------------------------------------------------------------------------------------
        _ExtractKeyValuePairs()
    finally:
        # Terminate the library even if an unexpected error escapes a sample
        # (for example from a module-availability check).
        PDFNet.Terminate()
    print("Done.")
257
# Run the samples only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
260

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales