Smart Data Extraction - C++ Sample Code

Requirements
View Demo

Sample code shows how to use the Apryse Data Extraction module to extract tabular data, document structure and form fields from PDF documents. Sample code provided in Python, C++, C# (.Net), Java, Node.js (JavaScript), PHP, Ruby and VB.

Looking for data extraction + WebViewer UI? Check out our Document Structure Extraction - Showcase Sample Code

Learn more about our Server SDK and Smart Data Extraction.

Implementation steps

  1. Get started with Server SDK in your language/framework
  2. Download the Data Extraction Module
  3. Add the sample code provided in this guide

To use this feature in production, your license key will need the Smart Data Extraction Package. Trial keys already include all packages.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/DataExtractionModule.h>
7#include <PDF/PDFNet.h>
8#include <PDF/PDFDoc.h>
9#include <PDF/Convert.h>
10#include <Filters/MemoryFilter.h>
11#include <string>
12#include <iostream>
13#include <fstream>
14#include "../../LicenseKey/CPP/LicenseKey.h"
15
16using namespace pdftron;
17using namespace PDF;
18using namespace Filters;
19using namespace std;
20
21//---------------------------------------------------------------------------------------
22// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
23// extract various types of data from PDF documents.
24//
25// The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
26//---------------------------------------------------------------------------------------
27
28void WriteTextToFile(const std::string& filename, const UString& text)
29{
30 ofstream out_file(filename.c_str(), ofstream::binary);
31 string out_buf = text.ConvertToUtf8();
32 out_file.write(out_buf.c_str(), out_buf.size());
33 out_file.close();
34}
35
36
// Directory holding the sample input PDFs (relative path).
std::string input_path = "../../TestFiles/";
// Directory that receives the files generated by the samples (relative path).
std::string output_path = "../../TestFiles/Output/";
39
40//---------------------------------------------------------------------------------------
41// The following sample illustrates how to extract tables from PDF documents.
42//---------------------------------------------------------------------------------------
43void TestTabularData()
44{
45 // Test if the add-on is installed
46 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular))
47 {
48 cout << endl;
49 cout << "Unable to run Data Extraction: Apryse SDK Tabular Data module not available." << endl;
50 cout << "---------------------------------------------------------------" << endl;
51 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
52 cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
53 cout << "module, ensure that the SDK is able to find the required files" << endl;
54 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
55 return;
56 }
57
58 // Extract tabular data as a JSON file
59 DataExtractionModule::ExtractData(input_path + UString("table.pdf"), output_path + UString("table.json"), DataExtractionModule::e_Tabular);
60
61 // Extract tabular data as a JSON string
62 UString json = DataExtractionModule::ExtractData(input_path + UString("financial.pdf"), DataExtractionModule::e_Tabular);
63 WriteTextToFile((output_path + "financial.json").c_str(), json);
64
65 // Extract tabular data as an XLSX file
66 DataExtractionModule::ExtractToXLSX(input_path + UString("table.pdf"), output_path + UString("table.xlsx"));
67
68 // Extract tabular data as an XLSX stream (also known as filter)
69 MemoryFilter output_xlsx_stream(0, false);
70 DataExtractionOptions options;
71 options.SetPages("1"); // extract page 1
72 DataExtractionModule::ExtractToXLSX(input_path + UString("financial.pdf"), output_xlsx_stream, &options);
73 output_xlsx_stream.SetAsInputFilter();
74 output_xlsx_stream.WriteToFile(output_path + UString("financial.xlsx"), false);
75}
76
77//---------------------------------------------------------------------------------------
78// The following sample illustrates how to extract document structure from PDF documents.
79//---------------------------------------------------------------------------------------
80void TestDocumentStructure()
81{
82 // Test if the add-on is installed
83 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure))
84 {
85 cout << endl;
86 cout << "Unable to run Data Extraction: Apryse SDK Structured Output module not available." << endl;
87 cout << "---------------------------------------------------------------" << endl;
88 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
89 cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
90 cout << "module, ensure that the SDK is able to find the required files" << endl;
91 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
92 return;
93 }
94
95 // Extract document structure as a JSON file
96 DataExtractionModule::ExtractData(input_path + UString("paragraphs_and_tables.pdf"), output_path + UString("paragraphs_and_tables.json"), DataExtractionModule::e_DocStructure);
97
98 // Extract document structure as a JSON string
99 UString json = DataExtractionModule::ExtractData(input_path + UString("tagged.pdf"), DataExtractionModule::e_DocStructure);
100 WriteTextToFile((output_path + "tagged.json").c_str(), json);
101}
102
103//---------------------------------------------------------------------------------------
104// The following sample illustrates how to extract form fields from PDF documents.
105//---------------------------------------------------------------------------------------
106void TestFormFields()
107{
108 // Test if the add-on is installed
109 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form))
110 {
111 cout << endl;
112 cout << "Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available." << endl;
113 cout << "---------------------------------------------------------------" << endl;
114 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
115 cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
116 cout << "module, ensure that the SDK is able to find the required files" << endl;
117 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
118 return;
119 }
120
121 // Extract form fields as a JSON file
122 DataExtractionModule::ExtractData(input_path + UString("formfields-scanned.pdf"), output_path + UString("formfields-scanned.json"), DataExtractionModule::e_Form);
123
124 // Extract form fields as a JSON string
125 UString json = DataExtractionModule::ExtractData(input_path + UString("formfields.pdf"), DataExtractionModule::e_Form);
126 WriteTextToFile((output_path + "formfields.json").c_str(), json);
127
128 //---------------------------------------------------------------------------------------
129 // Detect and add form fields to a PDF document.
130 // PDF document already has form fields, and this sample will update to new found fields.
131 //---------------------------------------------------------------------------------------
132 {
133 PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
134
135 DataExtractionModule::DetectAndAddFormFieldsToPDF(doc);
136
137 // Save the modfied pdf document
138 doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDF::SDFDoc::e_linearized, NULL);
139 }
140
141 //---------------------------------------------------------------------------------------
142 // Detect and add form fields to a PDF document.
143 // PDF document already has form fields, and this sample will keep the original fields.
144 //---------------------------------------------------------------------------------------
145 {
146 PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
147
148 // Setup DataExtractionOptions to keep old fields
149 DataExtractionOptions options;
150 options.SetOverlappingFormFieldBehavior("KeepOld");
151
152 DataExtractionModule::DetectAndAddFormFieldsToPDF(doc, &options);
153
154 // Save the modfied pdf document
155 doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDF::SDFDoc::e_linearized, NULL);
156 }
157}
158
159//---------------------------------------------------------------------------------------
160// The following sample illustrates how to extract key-value pairs from PDF documents.
161//---------------------------------------------------------------------------------------
162void TestGenericKeyValue() {
163
164 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_GenericKeyValue))
165 {
166 cout << endl;
167 cout << "Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available." << endl;
168 cout << "---------------------------------------------------------------" << endl;
169 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
170 cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
171 cout << "module, ensure that the SDK is able to find the required files" << endl;
172 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
173 return;
174 }
175
176 // Simple example: Extract Keys & Values as a JSON file
177 DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val.json"), DataExtractionModule::e_GenericKeyValue);
178
179 // Example with customized options:
180 // Extract Keys & Values from pages 2-4, excluding ads
181 DataExtractionOptions options;
182 options.SetPages("2-4");
183 RectCollection p2_exclusion_zones;
184 // Exclude the add-on on page 2
185 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
186 // Coordinates rotate with the page, if it has rotation applied.
187 p2_exclusion_zones.AddRect(166, 47, 562, 222);
188 options.AddExclusionZonesForPage(p2_exclusion_zones, 2);
189
190 RectCollection p4_inclusion_zones, p4_exclusion_zones;
191 // Only include the article text for page 4, exclude ads and headings
192 p4_inclusion_zones.AddRect(30, 432, 562, 684);
193 p4_exclusion_zones.AddRect(30, 657, 295, 684);
194 options.AddInclusionZonesForPage(p4_inclusion_zones, 4);
195 options.AddExclusionZonesForPage(p4_exclusion_zones, 4);
196
197 DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val_with_zones.json"), DataExtractionModule::e_GenericKeyValue, &options);
198}
199
200//---------------------------------------------------------------------------------------
201// The following sample illustrates how to extract document classes from PDF documents.
202//---------------------------------------------------------------------------------------
203void TestDocClassifier()
204{
205 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocClassification))
206 {
207 cout << endl;
208 cout << "Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available." << endl;
209 cout << "---------------------------------------------------------------" << endl;
210 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
211 cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
212 cout << "module, ensure that the SDK is able to find the required files" << endl;
213 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
214 return;
215 }
216
217 // Simple example: classify pages as a JSON file
218 DataExtractionModule::ExtractData(input_path + UString("Invoice.pdf"), output_path + UString("Invoice_Classified.json"), DataExtractionModule::e_DocClassification);
219
220 // Classify pages as a JSON string
221 UString json = DataExtractionModule::ExtractData(input_path + UString("Scientific_Publication.pdf"), DataExtractionModule::e_DocClassification);
222 WriteTextToFile((output_path + "Scientific_Publication_Classified.json").c_str(), json);
223
224 // Example with customized options:
225 DataExtractionOptions options;
226 // Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
227 options.SetMinimumConfidenceThreshold(0.7);
228 DataExtractionModule::ExtractData(input_path + UString("Email.pdf"), output_path + UString("Email_Classified.json"), DataExtractionModule::e_DocClassification, &options);
229}
230
231int main(int argc, char* argv[])
232{
233 // The first step in every application using PDFNet is to initialize the
234 // library and set the path to common PDF resources. The library is usually
235 // initialized only once, but calling Initialize() multiple times is also fine.
236 PDFNet::Initialize(LicenseKey);
237
238 int ret = 0;
239
240 try
241 {
242 PDFNet::AddResourceSearchPath("../../../Lib/");
243
244 TestTabularData();
245 TestDocumentStructure();
246 TestFormFields();
247 TestGenericKeyValue();
248 TestDocClassifier();
249 }
250 catch (Common::Exception& e)
251 {
252 cout << e << endl;
253 ret = 1;
254 }
255 catch (...)
256 {
257 cout << "Unknown Exception" << endl;
258 ret = 1;
259 }
260
261 PDFNet::Terminate();
262
263 return ret;
264}
265

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales