Smart Data Extraction - C++ Sample Code

Requirements
View Demo

This sample code shows how to use the Apryse Data Extraction module to extract tabular data, document structure and form fields from PDF documents. Sample code is provided in Python, C++, C# (.NET), Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our Server SDK and Smart Data Extraction.

Implementation steps

  1. Get started with Server SDK in your language/framework
  2. Download the Data Extraction Module
  3. Add the sample code provided in this guide

To use this feature in production, your license key will need the Smart Data Extraction Package. Trial keys already include all packages.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/DataExtractionModule.h>
7#include <PDF/PDFNet.h>
8#include <PDF/PDFDoc.h>
9#include <PDF/Convert.h>
10#include <Filters/MemoryFilter.h>
11#include <string>
12#include <iostream>
13#include <fstream>
14#include "../../LicenseKey/CPP/LicenseKey.h"
15
16using namespace pdftron;
17using namespace PDF;
18using namespace Filters;
19using namespace std;
20
21//---------------------------------------------------------------------------------------
22// The Data Extraction suite is an optional PDFNet add-on collection that can be used to
23// extract various types of data from PDF documents.
24//
25// The Apryse SDK Data Extraction suite can be downloaded from https://docs.apryse.com/core/guides/info/modules#data-extraction-module
26//---------------------------------------------------------------------------------------
27
28void WriteTextToFile(const std::string& filename, const UString& text)
29{
30 ofstream out_file(filename.c_str(), ofstream::binary);
31 string out_buf = text.ConvertToUtf8();
32 out_file.write(out_buf.c_str(), out_buf.size());
33 out_file.close();
34}
35
36
// Relative path prefix of the directory holding the sample input documents.
std::string input_path = "../../TestFiles/";
// Relative path prefix of the directory that receives the generated files.
std::string output_path = "../../TestFiles/Output/";
39
40//---------------------------------------------------------------------------------------
41// The following sample illustrates how to extract tables from PDF documents.
42//---------------------------------------------------------------------------------------
43void TestTabularData()
44{
45 // Test if the add-on is installed
46 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Tabular))
47 {
48 cout << endl;
49 cout << "Unable to run Data Extraction: Apryse SDK Tabular Data module not available." << endl;
50 cout << "---------------------------------------------------------------" << endl;
51 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
52 cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
53 cout << "module, ensure that the SDK is able to find the required files" << endl;
54 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
55 return;
56 }
57
58 // Extract tabular data as a JSON file
59 DataExtractionModule::ExtractData(input_path + UString("table.pdf"), output_path + UString("table.json"), DataExtractionModule::e_Tabular);
60
61 // Extract tabular data as a JSON string
62 UString json = DataExtractionModule::ExtractData(input_path + UString("financial.pdf"), DataExtractionModule::e_Tabular);
63 WriteTextToFile((output_path + "financial.json").c_str(), json);
64
65 // Extract tabular data as an XLSX file
66 DataExtractionModule::ExtractToXLSX(input_path + UString("table.pdf"), output_path + UString("table.xlsx"));
67
68 // Extract tabular data as an XLSX stream (also known as filter)
69 MemoryFilter output_xlsx_stream(0, false);
70 DataExtractionOptions options;
71 options.SetPages("1"); // extract page 1
72 DataExtractionModule::ExtractToXLSX(input_path + UString("financial.pdf"), output_xlsx_stream, &options);
73 output_xlsx_stream.SetAsInputFilter();
74 output_xlsx_stream.WriteToFile(output_path + UString("financial.xlsx"), false);
75}
76
77//---------------------------------------------------------------------------------------
78// The following sample illustrates how to extract document structure from PDF documents.
79//---------------------------------------------------------------------------------------
80void TestDocumentStructure()
81{
82 // Test if the add-on is installed
83 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_DocStructure))
84 {
85 cout << endl;
86 cout << "Unable to run Data Extraction: Apryse SDK Structured Output module not available." << endl;
87 cout << "---------------------------------------------------------------" << endl;
88 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
89 cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
90 cout << "module, ensure that the SDK is able to find the required files" << endl;
91 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
92 return;
93 }
94
95 // Extract document structure as a JSON file
96 DataExtractionModule::ExtractData(input_path + UString("paragraphs_and_tables.pdf"), output_path + UString("paragraphs_and_tables.json"), DataExtractionModule::e_DocStructure);
97
98 // Extract document structure as a JSON string
99 UString json = DataExtractionModule::ExtractData(input_path + UString("tagged.pdf"), DataExtractionModule::e_DocStructure);
100 WriteTextToFile((output_path + "tagged.json").c_str(), json);
101}
102
103//---------------------------------------------------------------------------------------
104// The following sample illustrates how to extract form fields from PDF documents.
105//---------------------------------------------------------------------------------------
106void TestFormFields()
107{
108 // Test if the add-on is installed
109 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_Form))
110 {
111 cout << endl;
112 cout << "Unable to run Data Extraction: Apryse SDK AIFormFieldExtractor module not available." << endl;
113 cout << "---------------------------------------------------------------" << endl;
114 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
115 cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
116 cout << "module, ensure that the SDK is able to find the required files" << endl;
117 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
118 return;
119 }
120
121 // Extract form fields as a JSON file
122 DataExtractionModule::ExtractData(input_path + UString("formfields-scanned.pdf"), output_path + UString("formfields-scanned.json"), DataExtractionModule::e_Form);
123
124 // Extract form fields as a JSON string
125 UString json = DataExtractionModule::ExtractData(input_path + UString("formfields.pdf"), DataExtractionModule::e_Form);
126 WriteTextToFile((output_path + "formfields.json").c_str(), json);
127
128 //---------------------------------------------------------------------------------------
129 // Detect and add form fields to a PDF document.
130 // PDF document already has form fields, and this sample will update to new found fields.
131 //---------------------------------------------------------------------------------------
132 {
133 PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
134
135 DataExtractionModule::DetectAndAddFormFieldsToPDF(doc);
136
137 // Save the modfied pdf document
138 doc.Save(output_path + "formfields-scanned-fields-new.pdf", SDF::SDFDoc::e_linearized, NULL);
139 }
140
141 //---------------------------------------------------------------------------------------
142 // Detect and add form fields to a PDF document.
143 // PDF document already has form fields, and this sample will keep the original fields.
144 //---------------------------------------------------------------------------------------
145 {
146 PDFDoc doc(input_path + "formfields-scanned-withfields.pdf");
147
148 // Setup DataExtractionOptions to keep old fields
149 DataExtractionOptions options;
150 options.SetOverlappingFormFieldBehavior("KeepOld");
151
152 DataExtractionModule::DetectAndAddFormFieldsToPDF(doc, &options);
153
154 // Save the modfied pdf document
155 doc.Save(output_path + "formfields-scanned-fields-old.pdf", SDF::SDFDoc::e_linearized, NULL);
156 }
157}
158
159//---------------------------------------------------------------------------------------
160// The following sample illustrates how to extract key-value pairs from PDF documents.
161//---------------------------------------------------------------------------------------
162void TestGenericKeyValue() {
163
164 if (!DataExtractionModule::IsModuleAvailable(DataExtractionModule::e_GenericKeyValue))
165 {
166 cout << endl;
167 cout << "Unable to run Data Extraction: Apryse SDK AIPageObjectExtractor module not available." << endl;
168 cout << "---------------------------------------------------------------" << endl;
169 cout << "The Data Extraction suite is an optional add-on, available for download" << endl;
170 cout << "at https://docs.apryse.com/core/guides/info/modules#data-extraction-module . If you have already downloaded this" << endl;
171 cout << "module, ensure that the SDK is able to find the required files" << endl;
172 cout << "using the PDFNet::AddResourceSearchPath() function." << endl << endl;
173 return;
174 }
175
176 // Simple example: Extract Keys & Values as a JSON file
177 DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val.json"), DataExtractionModule::e_GenericKeyValue);
178
179 // Example with customized options:
180 // Extract Keys & Values from pages 2-4, excluding ads
181 DataExtractionOptions options;
182 options.SetPages("2-4");
183 RectCollection p2_exclusion_zones;
184 // Exclude the ad on page 2
185 // These coordinates are in PDF user space, with the origin at the bottom left corner of the page
186 // Coordinates rotate with the page, if it has rotation applied.
187 p2_exclusion_zones.AddRect(166, 47, 562, 222);
188 options.AddExclusionZonesForPage(p2_exclusion_zones, 2);
189
190 RectCollection p4_inclusion_zones, p4_exclusion_zones;
191 // Only include the article text for page 4, exclude ads and headings
192 p4_inclusion_zones.AddRect(30, 432, 562, 684);
193 p4_exclusion_zones.AddRect(30, 657, 295, 684);
194 options.AddInclusionZonesForPage(p4_inclusion_zones, 4);
195 options.AddExclusionZonesForPage(p4_exclusion_zones, 4);
196
197 DataExtractionModule::ExtractData(input_path + UString("newsletter.pdf"), output_path + UString("newsletter_key_val_with_zones.json"), DataExtractionModule::e_GenericKeyValue, &options);
198}
199
200int main(int argc, char* argv[])
201{
202 // The first step in every application using PDFNet is to initialize the
203 // library and set the path to common PDF resources. The library is usually
204 // initialized only once, but calling Initialize() multiple times is also fine.
205 PDFNet::Initialize(LicenseKey);
206
207 int ret = 0;
208
209 try
210 {
211 PDFNet::AddResourceSearchPath("../../../Lib/");
212
213 TestTabularData();
214 TestDocumentStructure();
215 TestFormFields();
216 TestGenericKeyValue();
217 }
218 catch (Common::Exception& e)
219 {
220 cout << e << endl;
221 ret = 1;
222 }
223 catch (...)
224 {
225 cout << "Unknown Exception" << endl;
226 ret = 1;
227 }
228
229 PDFNet::Terminate();
230
231 return ret;
232}
233

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales