Smart Data Extraction - C++ Sample Code

This sample code shows how to use the Apryse Data Extraction module to extract tabular data, document structure, form fields and key-value pairs from PDF documents. Sample code is provided in Python, C++, C# (.NET), Java, Node.js (JavaScript), PHP, Ruby and VB.

To run this sample, you will need to:

  1. Get started with Server SDK in your language/framework
  2. Download the Data Extraction Module

Learn more about our Server SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------

#include <PDF/DataExtractionModule.h>
#include <PDF/PDFNet.h>
#include <PDF/PDFDoc.h>
#include <PDF/Convert.h>
#include <Filters/MemoryFilter.h>
#include <exception>
#include <fstream>
#include <iostream>
#include <string>
#include "../../LicenseKey/CPP/LicenseKey.h"

using namespace pdftron;
using namespace PDF;
using namespace Filters;
using namespace std;

//---------------------------------------------------------------------------------------
// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
// extract various types of data from PDF documents.
//
// The Apryse SDK Data Extraction suite can be downloaded from
// https://docs.apryse.com/core/guides/info/modules#data-extraction-module
//---------------------------------------------------------------------------------------

28void WriteTextToFile(const std::string& filename, const UString& text)
29{
30 ofstream out_file(filename.c_str(), ofstream::binary);
31 string out_buf = text.ConvertToUtf8();
32 out_file.write(out_buf.c_str(), out_buf.size());
33 out_file.close();
34}
35
36
// Relative locations of the sample input files and of the directory that
// receives the generated output. Neither is modified at runtime.
const std::string input_path("../../TestFiles/");
const std::string output_path("../../TestFiles/Output/");

40//---------------------------------------------------------------------------------------
41// The following sample illustrates how to extract tables from PDF documents.
42//---------------------------------------------------------------------------------------
43void TestTabularData()
44{
45 // Test if the add-on is installed
46 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular))
47 {
48 cout << endl;
49 cout << "Unable to run Data Extraction: Apryse SDK Tabular Data module not available." << endl;
50 cout << "---------------------------------------------------------------" << endl;
51 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
52 cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
53 cout << "module, ensure that the SDK is able to find the required files" << endl;
54 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
55 return;
56 }
57
58 // Extract tabular data as a JSON file
59 DataExtractionModule::ExtractData(input_path + UString("table.pdf"), output_path + UString("table.json"), DataExtractionModule::e_Tabular);
60
61 // Extract tabular data as a JSON string
62 UString json = DataExtractionModule::ExtractData(input_path + UString("financial.pdf"), DataExtractionModule::e_Tabular);
63 WriteTextToFile((output_path + "financial.json").c_str(), json);
64
65 // Extract tabular data as an XLSX file
66 DataExtractionModule::ExtractToXLSX(input_path + UString("table.pdf"), output_path + UString("table.xlsx"));
67
68 // Extract tabular data as an XLSX stream (also known as filter)
69 MemoryFilter output_xlsx_stream(0, false);
70 DataExtractionOptions options;
71 options.SetPages("1"); // extract page 1
72 DataExtractionModule::ExtractToXLSX(input_path + UString("financial.pdf"), output_xlsx_stream, &options);
73 output_xlsx_stream.SetAsInputFilter();
74 output_xlsx_stream.WriteToFile(output_path + UString("financial.xlsx"), false);
75}
76
77//---------------------------------------------------------------------------------------
78// The following sample illustrates how to extract document structure from PDF documents.
79//---------------------------------------------------------------------------------------
80void TestDocumentStructure()
81{
82 // Test if the add-on is installed
83 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure))
84 {
85 cout << endl;
86 cout << "Unable to run Data Extraction: Apryse SDK Structured Output module not available." << endl;
87 cout << "---------------------------------------------------------------" << endl;
88 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
89 cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
90 cout << "module, ensure that the SDK is able to find the required files" << endl;
91 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
92 return;
93 }
94
95 // Extract document structure as a JSON file
96 DataExtractionModule::ExtractData(input_path + UString("paragraphs_and_tables.pdf"), output_path + UString("paragraphs_and_tables.json"), DataExtractionModule::e_DocStructure);
97
98 // Extract document structure as a JSON string
99 UString json = DataExtractionModule::ExtractData(input_path + UString("tagged.pdf"), DataExtractionModule::e_DocStructure);
100 WriteTextToFile((output_path + "tagged.json").c_str(), json);
101}
102
103//---------------------------------------------------------------------------------------
104// The following sample illustrates how to extract form fields from PDF documents.
105//---------------------------------------------------------------------------------------
106void TestFormFields()
107{
108 // Test if the add-on is installed
109 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form))
110 {
111 cout << endl;
112 cout << "Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available." << endl;
113 cout << "---------------------------------------------------------------" << endl;
114 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
115 cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
116 cout << "module, ensure that the SDK is able to find the required files" << endl;
117 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
118 return;
119 }
120
121 // Extract form fields as a JSON file
122 DataExtractionModule::ExtractData(input_path + UString("formfields-scanned.pdf"), output_path + UString("formfields-scanned.json"), DataExtractionModule::e_Form);
123
124 // Extract form fields as a JSON string
125 UString json = DataExtractionModule::ExtractData(input_path + UString("formfields.pdf"), DataExtractionModule::e_Form);
126 WriteTextToFile((output_path + "formfields.json").c_str(), json);
127
128 //---------------------------------------------------------------------------------------
129 // Detect and add form fields to a PDF document.
130 // PDF document already has form fields, and this sample will update to new found fields.
131 //---------------------------------------------------------------------------------------
132 {
133 PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
134
135 DataExtractionModule::DetectAndAddFormFieldsToPDF(doc);
136
137 // Save the modfied pdf document
138 doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDF::SDFDoc::e_linearized, NULL);
139 }
140
141 //---------------------------------------------------------------------------------------
142 // Detect and add form fields to a PDF document.
143 // PDF document already has form fields, and this sample will keep the original fields.
144 //---------------------------------------------------------------------------------------
145 {
146 PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
147
148 // Setup DataExtractionOptions to keep old fields
149 DataExtractionOptions options;
150 options.SetOverlappingFormFieldBehavior("KeepOld");
151
152 DataExtractionModule::DetectAndAddFormFieldsToPDF(doc, &options);
153
154 // Save the modfied pdf document
155 doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDF::SDFDoc::e_linearized, NULL);
156 }
157}
158
159//---------------------------------------------------------------------------------------
160// The following sample illustrates how to extract key-value pairs from PDF documents.
161//---------------------------------------------------------------------------------------
162void TestGenericKeyValue() {
163
164 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_GenericKeyValue))
165 {
166 cout << endl;
167 cout << "Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available." << endl;
168 cout << "---------------------------------------------------------------" << endl;
169 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
170 cout << "at http://www.pdftron.com/. If you have already downloaded this" << endl;
171 cout << "module, ensure that the SDK is able to find the required files" << endl;
172 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
173 return;
174 }
175
176 // Simple example: Extract Keys & Values as a JSON file
177 DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val.json"), DataExtractionModule::e_GenericKeyValue);
178
179 // Example with customized options:
180 // Extract Keys & Values from pages 2-4, excluding ads
181 DataExtractionOptions options;
182 options.SetPages("2-4");
183 RectCollection p2_exclusion_zones;
184 // Exclude the ad on page 2
185 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
186 // Coordinates rotate with the page, if it has rotation applied.
187 p2_exclusion_zones.AddRect(166, 47, 562, 222);
188 options.AddExclusionZonesForPage(p2_exclusion_zones, 2);
189
190 RectCollection p4_inclusion_zones, p4_exclusion_zones;
191 // Only include the article text for page 4, exclude ads and headings
192 p4_inclusion_zones.AddRect(30, 432, 562, 684);
193 p4_exclusion_zones.AddRect(30, 657, 295, 684);
194 options.AddInclusionZonesForPage(p4_inclusion_zones, 4);
195 options.AddExclusionZonesForPage(p4_exclusion_zones, 4);
196
197 DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val_with_zones.json"), DataExtractionModule::e_GenericKeyValue, &options);
198}
199
200int main(int argc, char* argv[])
201{
202 // The first step in every application using PDFNet is to initialize the
203 // library and set the path to common PDF resources. The library is usually
204 // initialized only once, but calling Initialize() multiple times is also fine.
205 PDFNet::Initialize(LicenseKey);
206
207 int ret = 0;
208
209 try
210 {
211 PDFNet::AddResourceSearchPath("../../../Lib/");
212
213 TestTabularData();
214 TestDocumentStructure();
215 TestFormFields();
216 TestGenericKeyValue();
217 }
218 catch (Common::Exception& e)
219 {
220 cout << e << endl;
221 ret = 1;
222 }
223 catch (...)
224 {
225 cout << "Unknown Exception" << endl;
226 ret = 1;
227 }
228
229 PDFNet::Terminate();
230
231 return ret;
232}
233

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales