Extract Text, Read, Parse PDF - TextExtract - C++ Sample Code

Sample code for using Apryse SDK to read a PDF (parse and extract text), provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. If you'd like to search text on PDF pages, see our code sample for text search.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/PDFNet.h>
7#include <PDF/PDFDoc.h>
8#include <PDF/ElementReader.h>
9#include <PDF/Element.h>
10#include <PDF/Font.h>
11#include <Filters/FilterReader.h>
12#include <PDF/Image/Image2RGB.h>
13#include <PDF/TextExtractor.h>
14
15// This sample illustrates the basic text extraction capabilities of PDFNet.
16
17#include <iostream>
18#include "../../LicenseKey/CPP/LicenseKey.h"
19
20using namespace std;
21
22using namespace pdftron;
23using namespace PDF;
24using namespace SDF;
25using namespace Common;
26using namespace Filters;
27
28// A utility method used to dump all text content in the console window.
29void DumpAllText(ElementReader& reader)
30{
31 Element element;
32 while ((element = reader.Next()) != 0)
33 {
34 switch (element.GetType())
35 {
36 case Element::e_text_begin:
37 cout << "\n--> Text Block Begin\n";
38 break;
39 case Element::e_text_end:
40 cout << "\n--> Text Block End\n";
41 break;
42 case Element::e_text:
43 {
44 Rect bbox;
45 element.GetBBox(bbox);
46 cout << "\n--> BBox: " << bbox.x1 << ", "
47 << bbox.y1 << ", "
48 << bbox.x2 << ", "
49 << bbox.y2 << "\n";
50
51 UString arr = element.GetTextString();
52 cout << arr << "\n";
53 }
54 break;
55 case Element::e_text_new_line:
56 cout << "\n--> New Line\n";
57 break;
58 case Element::e_form: // Process form XObjects
59 reader.FormBegin();
60 DumpAllText(reader);
61 reader.End();
62 break;
63 }
64 }
65}
66
67// A helper method for ReadTextFromRect
68void RectTextSearch(ElementReader& reader, const Rect& pos, UString& srch_str)
69{
70 Element element;
71 while (element = reader.Next())
72 {
73 switch (element.GetType())
74 {
75 case Element::e_text:
76 {
77 Rect bbox;
78 element.GetBBox(bbox);
79 if(bbox.IntersectRect(bbox, pos))
80 {
81 UString arr = element.GetTextString();
82 srch_str += arr;
83 srch_str += "\n"; // add a new line?
84 }
85 break;
86 }
87 case Element::e_text_new_line:
88 {
89 break;
90 }
91 case Element::e_form: // Process form XObjects
92 {
93 reader.FormBegin();
94 RectTextSearch(reader, pos, srch_str);
95 reader.End();
96 break;
97 }
98 }
99 }
100}
101
102// A utility method used to extract all text content from
103// a given selection rectangle. The rectangle coordinates are
104// expressed in PDF user/page coordinate system.
105UString ReadTextFromRect(Page& page, const Rect& pos, ElementReader& reader)
106{
107 UString srch_str;
108 reader.Begin(page);
109 RectTextSearch(reader, pos, srch_str);
110 reader.End();
111 return srch_str;
112}
113
114
115void PrintStyle(TextExtractor::Style& s)
116{
117 UInt8 rgb[3];
118 char rgb_hex[24];
119
120 s.GetColor(rgb);
121 sprintf(rgb_hex, "%02X%02X%02X;", rgb[0], rgb[1], rgb[2]);
122 cout << " style=\"font-family:" << s.GetFontName() << "; " << "font-size:" << s.GetFontSize() << ";"
123 << (s.IsSerif() ? " sans-serif; " : " ") << "color:#" << rgb_hex << "\"";
124}
125
126int main(int argc, char *argv[])
127{
128 int ret = 0;
129 PDFNet::Initialize(LicenseKey);
130 // Relative path to the folder containing test files.
131 string input_path = "../../TestFiles/newsletter.pdf";
132
133
134
135
136 const char* filein = argc>1 ? argv[1] : input_path.c_str();
137
138 bool example1_basic = false;
139 bool example2_xml = false;
140 bool example3_wordlist = false;
141 bool example4_advanced = true;
142 bool example5_low_level = false;
143
144 // Sample code showing how to use high-level text extraction APIs.
145 try
146 {
147 PDFDoc doc(filein);
148 doc.InitSecurityHandler();
149
150 Page page = doc.GetPage(1);
151 if (!page){
152 cout << "Page not found." << endl;
153 return 1;
154 }
155
156 TextExtractor txt;
157 txt.Begin(page); // Read the page.
158 // Other options you may want to consider...
159 // txt.Begin(*itr, 0, TextExtractor::e_no_dup_remove);
160 // txt.Begin(*itr, 0, TextExtractor::e_remove_hidden_text);
161
162
163 // Example 1. Get all text on the page in a single string.
164 // Words will be separated with space or new line characters.
165 if (example1_basic)
166 {
167 // Get the word count.
168 cout << "Word Count: " << txt.GetWordCount() << endl;
169
170 UString text;
171 txt.GetAsText(text);
172 cout << "\n\n- GetAsText --------------------------\n" << text << endl;
173 cout << "-----------------------------------------------------------" << endl;
174 }
175
176 // Example 2. Get XML logical structure for the page.
177 if (example2_xml)
178 {
179 UString text;
180 txt.GetAsXML(text, TextExtractor::e_words_as_elements | TextExtractor::e_output_bbox | TextExtractor::e_output_style_info);
181 cout << "\n\n- GetAsXML --------------------------\n" << text << endl;
182 cout << "-----------------------------------------------------------" << endl;
183 }
184
185 // Example 3. Extract words one by one.
186 if (example3_wordlist)
187 {
188 UString text;
189 TextExtractor::Line line = txt.GetFirstLine();
190 TextExtractor::Word word;
191 for (; line.IsValid(); line=line.GetNextLine()) {
192 for (word=line.GetFirstWord(); word.IsValid(); word=word.GetNextWord()) {
193 text.Assign(word.GetString(), word.GetStringLen());
194 cout << text << '\n';
195 }
196 }
197 cout << "-----------------------------------------------------------" << endl;
198 }
199
200 // Example 4. A more advanced text extraction example.
201 // The output is XML structure containing paragraphs, lines, words,
202 // as well as style and positioning information.
203 if (example4_advanced)
204 {
205 const double *b;
206 double q[8];
207 int cur_flow_id=-1, cur_para_id=-1;
208
209 UString uni_str;
210 TextExtractor::Line line;
211 TextExtractor::Word word;
212 TextExtractor::Style s, line_style;
213
214 cout << "<PDFText>\n";
215
216 // For each line on the page...
217 for (line=txt.GetFirstLine(); line.IsValid(); line=line.GetNextLine())
218 {
219 if ( line.GetNumWords() == 0 ) {
220 continue;
221 }
222
223 if (cur_flow_id != line.GetFlowID()) {
224 if (cur_flow_id != -1) {
225 if (cur_para_id != -1) {
226 cur_para_id = -1;
227 cout << "</Para>\n";
228 }
229 cout << "</Flow>\n";
230 }
231 cur_flow_id = line.GetFlowID();
232 cout << "<Flow id=\""<< cur_flow_id << "\">\n";
233 }
234
235 if (cur_para_id != line.GetParagraphID()) {
236 if (cur_para_id != -1)
237 cout << "</Para>\n";
238 cur_para_id = line.GetParagraphID();
239 cout << "<Para id=\""<< cur_para_id << "\">\n";
240 }
241
242 b = line.GetBBox();
243 line_style = line.GetStyle();
244 printf("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", b[0], b[1], b[2], b[3]);
245 PrintStyle(line_style);
246 cout << " cur_num=\"" << line.GetCurrentNum() << "\"";
247 cout << ">\n";
248
249 // For each word in the line...
250 for (word=line.GetFirstWord(); word.IsValid(); word=word.GetNextWord())
251 {
252 // Output the bounding box for the word.
253 word.GetBBox(q);
254 printf("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", q[0], q[1], q[2], q[3]);
255 cout << " cur_num=\"" << word.GetCurrentNum() << "\"";
256 int sz = word.GetStringLen();
257 if (sz == 0) continue;
258
259 // If the word style is different from the parent style, output the new style.
260 s = word.GetStyle();
261 if (s != line_style) {
262 PrintStyle(s);
263 }
264
265 uni_str.Assign(word.GetString(), sz);
266 cout << ">" << uni_str;
267 cout << "</Word>\n";
268 }
269 cout << "</Line>\n";
270 }
271
272 if (cur_flow_id != -1) {
273 if (cur_para_id != -1) {
274 cur_para_id = -1;
275 cout << "</Para>\n";
276 }
277 cout << "</Flow>\n";
278 }
279 cout << "</PDFText>\n";
280 }
281 }
282 catch(Exception& e)
283 {
284 cout << e << endl;
285 ret = 1;
286 }
287 catch(...)
288 {
289 cout << "Unknown Exception" << endl;
290 ret = 1;
291 }
292
293
294 if(example5_low_level)
295 {
296 try
297 {
298 PDFDoc doc(filein);
299 doc.InitSecurityHandler();
300
301 // Example 1. Extract all text content from the document
302
303 ElementReader reader;
304 // Read every page
305 for (PageIterator itr=doc.GetPageIterator(); itr.HasNext(); itr.Next())
306 {
307 reader.Begin(itr.Current());
308 DumpAllText(reader);
309 reader.End();
310 }
311
312 // Example 2. Extract text content based on the
313 // selection rectangle.
314 cout << "\n----------------------------------------------------";
315 cout << "\nExtract text based on the selection rectangle.";
316 cout << "\n----------------------------------------------------\n";
317
318 Page first_page = doc.GetPageIterator().Current();
319 UString s1 = ReadTextFromRect(first_page, Rect(27, 392, 563, 534), reader);
320 cout << "\nField 1: " << s1;
321
322 s1 = ReadTextFromRect(first_page, Rect(28, 551, 106, 623), reader);
323 cout << "\nField 2: " << s1;
324
325 s1 = ReadTextFromRect(first_page, Rect(208, 550, 387, 621), reader);
326 cout << "\nField 3: " << s1;
327
328 // ...
329 cout << "Done." << endl;
330 }
331 catch(Exception& e)
332 {
333 cout << e << endl;
334 ret = 1;
335 }
336 catch(...)
337 {
338 cout << "Unknown Exception" << endl;
339 ret = 1;
340 }
341 }
342 PDFNet::Terminate();
343 return ret;
344}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales