Extract Text, Read, Parse PDF - TextExtract - C++ Sample Code

Sample code for using the Apryse SDK to read a PDF (parse and extract text), provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go, and VB. If you'd like to search for text on PDF pages, see our code sample for text search. Learn more about our Server SDK and PDF Data Extraction SDK capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/PDFNet.h>
7#include <PDF/PDFDoc.h>
8#include <PDF/ElementReader.h>
9#include <PDF/Element.h>
10#include <PDF/Font.h>
11#include <Filters/FilterReader.h>
12#include <PDF/Image/Image2RGB.h>
13#include <PDF/TextExtractor.h>
14
15// This sample illustrates the basic text extraction capabilities of PDFNet.
16
#include <cstdio>
#include <iostream>
#include "../../LicenseKey/CPP/LicenseKey.h"
19
20using namespace std;
21
22using namespace pdftron;
23using namespace PDF;
24using namespace SDF;
25using namespace Common;
26using namespace Filters;
27
28// A utility method used to dump all text content in the console window.
29void DumpAllText(ElementReader& reader)
30{
31 Element element;
32 while ((element = reader.Next()) != 0)
33 {
34 switch (element.GetType())
35 {
36 case Element::e_text_begin:
37 cout << "\n--> Text Block Begin\n";
38 break;
39 case Element::e_text_end:
40 cout << "\n--> Text Block End\n";
41 break;
42 case Element::e_text:
43 {
44 Rect bbox;
45 element.GetBBox(bbox);
46 cout << "\n--> BBox: " << bbox.x1 << ", "
47 << bbox.y1 << ", "
48 << bbox.x2 << ", "
49 << bbox.y2 << "\n";
50
51 UString arr = element.GetTextString();
52 cout << arr << "\n";
53 }
54 break;
55 case Element::e_text_new_line:
56 cout << "\n--> New Line\n";
57 break;
58 case Element::e_form: // Process form XObjects
59 reader.FormBegin();
60 DumpAllText(reader);
61 reader.End();
62 break;
63 }
64 }
65}
66
67// A helper method for ReadTextFromRect
68void RectTextSearch(ElementReader& reader, const Rect& pos, UString& srch_str)
69{
70 Element element;
71 while (element = reader.Next())
72 {
73 switch (element.GetType())
74 {
75 case Element::e_text:
76 {
77 Rect bbox;
78 element.GetBBox(bbox);
79 if(bbox.IntersectRect(bbox, pos))
80 {
81 UString arr = element.GetTextString();
82 srch_str += arr;
83 srch_str += "\n"; // add a new line?
84 }
85 break;
86 }
87 case Element::e_text_new_line:
88 {
89 break;
90 }
91 case Element::e_form: // Process form XObjects
92 {
93 reader.FormBegin();
94 RectTextSearch(reader, pos, srch_str);
95 reader.End();
96 break;
97 }
98 }
99 }
100}
101
102// A utility method used to extract all text content from
103// a given selection rectangle. The rectangle coordinates are
104// expressed in PDF user/page coordinate system.
105UString ReadTextFromRect(Page& page, const Rect& pos, ElementReader& reader)
106{
107 UString srch_str;
108 reader.Begin(page);
109 RectTextSearch(reader, pos, srch_str);
110 reader.End();
111 return srch_str;
112}
113
114
115void PrintStyle(TextExtractor::Style& s)
116{
117 UInt8 rgb[3];
118 char rgb_hex[24];
119
120 s.GetColor(rgb);
121 sprintf(rgb_hex, "%02X%02X%02X;", rgb[0], rgb[1], rgb[2]);
122 cout << " style=\"font-family:" << s.GetFontName() << "; " << "font-size:" << s.GetFontSize() << ";"
123 << (s.IsSerif() ? " sans-serif; " : " ") << "color:#" << rgb_hex << "\"";
124}
125
126int main(int argc, char *argv[])
127{
128 int ret = 0;
129 PDFNet::Initialize(LicenseKey);
130 // Relative path to the folder containing test files.
131 string input_path = "../../TestFiles/newsletter.pdf";
132
133
134
135
136 const char* filein = argc>1 ? argv[1] : input_path.c_str();
137
138 bool example1_basic = false;
139 bool example2_xml = false;
140 bool example3_wordlist = false;
141 bool example4_advanced = true;
142 bool example5_low_level = false;
143
144 // Sample code showing how to use high-level text extraction APIs.
145 try
146 {
147 PDFDoc doc(filein);
148 doc.InitSecurityHandler();
149
150 Page page = doc.GetPage(1);
151 if (!page){
152 cout << "Page not found." << endl;
153 return 1;
154 }
155
156 TextExtractor txt;
157 txt.Begin(page); // Read the page.
158 // Other options you may want to consider...
159 // txt.Begin(*itr, 0, TextExtractor::e_no_dup_remove);
160 // txt.Begin(*itr, 0, TextExtractor::e_remove_hidden_text);
161
162
163 // Example 1. Get all text on the page in a single string.
164 // Words will be separated with space or new line characters.
165 if (example1_basic)
166 {
167 // Get the word count.
168 cout << "Word Count: " << txt.GetWordCount() << endl;
169
170 UString text;
171 txt.GetAsText(text);
172 cout << "\n\n- GetAsText --------------------------\n" << text << endl;
173 cout << "-----------------------------------------------------------" << endl;
174 }
175
176 // Example 2. Get XML logical structure for the page.
177 if (example2_xml)
178 {
179 UString text;
180 txt.GetAsXML(text, TextExtractor::e_words_as_elements | TextExtractor::e_output_bbox | TextExtractor::e_output_style_info);
181 cout << "\n\n- GetAsXML --------------------------\n" << text << endl;
182 cout << "-----------------------------------------------------------" << endl;
183 }
184
185 // Example 3. Extract words one by one.
186 if (example3_wordlist)
187 {
188 UString text;
189 TextExtractor::Line line = txt.GetFirstLine();
190 TextExtractor::Word word;
191 for (; line.IsValid(); line=line.GetNextLine()) {
192 for (word=line.GetFirstWord(); word.IsValid(); word=word.GetNextWord()) {
193 text.Assign(word.GetString(), word.GetStringLen());
194 cout << text << '\n';
195 }
196 }
197 cout << "-----------------------------------------------------------" << endl;
198 }
199
200 // Example 4. A more advanced text extraction example.
201 // The output is XML structure containing paragraphs, lines, words,
202 // as well as style and positioning information.
203 if (example4_advanced)
204 {
205 const double *b;
206 double q[8];
207 int cur_flow_id=-1, cur_para_id=-1;
208
209 UString uni_str;
210 TextExtractor::Line line;
211 TextExtractor::Word word;
212 TextExtractor::Style s, line_style;
213
214 cout << "<PDFText>\n";
215
216 // For each line on the page...
217 for (line=txt.GetFirstLine(); line.IsValid(); line=line.GetNextLine())
218 {
219 if ( line.GetNumWords() == 0 ) {
220 continue;
221 }
222
223 if (cur_flow_id != line.GetFlowID()) {
224 if (cur_flow_id != -1) {
225 if (cur_para_id != -1) {
226 cur_para_id = -1;
227 cout << "</Para>\n";
228 }
229 cout << "</Flow>\n";
230 }
231 cur_flow_id = line.GetFlowID();
232 cout << "<Flow id=\""<< cur_flow_id << "\">\n";
233 }
234
235 if (cur_para_id != line.GetParagraphID()) {
236 if (cur_para_id != -1)
237 cout << "</Para>\n";
238 cur_para_id = line.GetParagraphID();
239 cout << "<Para id=\""<< cur_para_id << "\">\n";
240 }
241
242 b = line.GetBBox();
243 line_style = line.GetStyle();
244 printf("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", b[0], b[1], b[2], b[3]);
245 PrintStyle(line_style);
246 cout << " cur_num=\"" << line.GetCurrentNum() << "\"";
247 cout << ">\n";
248
249 // For each word in the line...
250 for (word=line.GetFirstWord(); word.IsValid(); word=word.GetNextWord())
251 {
252 // Output the bounding box for the word.
253 word.GetBBox(q);
254 printf("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", q[0], q[1], q[2], q[3]);
255 cout << " cur_num=\"" << word.GetCurrentNum() << "\"";
256 int sz = word.GetStringLen();
257 if (sz == 0) continue;
258
259 // If the word style is different from the parent style, output the new style.
260 s = word.GetStyle();
261 if (s != line_style) {
262 PrintStyle(s);
263 }
264
265 uni_str.Assign(word.GetString(), sz);
266 cout << ">" << uni_str;
267 cout << "</Word>\n";
268 }
269 cout << "</Line>\n";
270 }
271
272 if (cur_flow_id != -1) {
273 if (cur_para_id != -1) {
274 cur_para_id = -1;
275 cout << "</Para>\n";
276 }
277 cout << "</Flow>\n";
278 }
279 cout << "</PDFText>\n";
280 }
281 }
282 catch(Exception& e)
283 {
284 cout << e << endl;
285 ret = 1;
286 }
287 catch(...)
288 {
289 cout << "Unknown Exception" << endl;
290 ret = 1;
291 }
292
293
294 if(example5_low_level)
295 {
296 try
297 {
298 PDFDoc doc(filein);
299 doc.InitSecurityHandler();
300
301 // Example 1. Extract all text content from the document
302
303 ElementReader reader;
304 // Read every page
305 for (PageIterator itr=doc.GetPageIterator(); itr.HasNext(); itr.Next())
306 {
307 reader.Begin(itr.Current());
308 DumpAllText(reader);
309 reader.End();
310 }
311
312 // Example 2. Extract text content based on the
313 // selection rectangle.
314 cout << "\n----------------------------------------------------";
315 cout << "\nExtract text based on the selection rectangle.";
316 cout << "\n----------------------------------------------------\n";
317
318 Page first_page = doc.GetPageIterator().Current();
319 UString s1 = ReadTextFromRect(first_page, Rect(27, 392, 563, 534), reader);
320 cout << "\nField 1: " << s1;
321
322 s1 = ReadTextFromRect(first_page, Rect(28, 551, 106, 623), reader);
323 cout << "\nField 2: " << s1;
324
325 s1 = ReadTextFromRect(first_page, Rect(208, 550, 387, 621), reader);
326 cout << "\nField 3: " << s1;
327
328 // ...
329 cout << "Done." << endl;
330 }
331 catch(Exception& e)
332 {
333 cout << e << endl;
334 ret = 1;
335 }
336 catch(...)
337 {
338 cout << "Unknown Exception" << endl;
339 ret = 1;
340 }
341 }
342 PDFNet::Terminate();
343 return ret;
344}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales