PDF Data Extraction - Images, Text, Paths - C++ Sample Code

Sample code for using Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Sample code provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/PDFNet.h>
7#include <PDF/PDFDoc.h>
8#include <PDF/ElementReader.h>
9#include <PDF/Element.h>
10#include <PDF/Font.h>
11#include <Filters/FilterReader.h>
12#include <PDF/Image/Image2RGB.h>
13
14#include <iostream>
15#include <assert.h>
16#include "../../LicenseKey/CPP/LicenseKey.h"
17
18using namespace std;
19
20using namespace pdftron;
21using namespace PDF;
22using namespace SDF;
23using namespace Common;
24using namespace Filters;
25
// File-scope scratch buffer: ProcessPath formats each path segment into it
// before writing the text to cout.
26char m_buf[4000];
27
// Forward declaration. ProcessPath and ProcessText call ProcessElements
// before its definition appears further down in this file.
28void ProcessElements(ElementReader& reader);
29
30void ProcessPath(ElementReader& reader, Element path)
31{
32 if (path.IsClippingPath())
33 {
34 cout << "This is a clipping path" << endl;
35 }
36
37 PathData d = path.GetPathData();
38
39 const UChar* opr = &d.GetOperators().front();
40 const UChar *opr_itr = opr, *opr_end = opr + d.GetOperators().size();
41 const double* data = &d.GetPoints().front();
42 const double *data_itr = data, *data_end = data + d.GetPoints().size();
43
44 double x1, y1, x2, y2, x3, y3;
45
46 // Use path.GetCTM() if you are interested in CTM (current transformation matrix).
47
48 cout << " Path Data Points := \"";
49 for (; opr_itr<opr_end; ++opr_itr)
50 {
51 switch(*opr_itr)
52 {
53 case PathData::e_moveto:
54 x1 = *data_itr; ++data_itr;
55 y1 = *data_itr; ++data_itr;
56 sprintf(m_buf, "M%.0f %.0f", x1, y1);
57 cout << m_buf;
58 break;
59 case PathData::e_lineto:
60 x1 = *data_itr; ++data_itr;
61 y1 = *data_itr; ++data_itr;
62 sprintf(m_buf, " L%.0f %.0f", x1, y1);
63 cout << m_buf;
64 break;
65 case PathData::e_cubicto:
66 x1 = *data_itr; ++data_itr;
67 y1 = *data_itr; ++data_itr;
68 x2 = *data_itr; ++data_itr;
69 y2 = *data_itr; ++data_itr;
70 x3 = *data_itr; ++data_itr;
71 y3 = *data_itr; ++data_itr;
72 sprintf(m_buf, " C%.0f %.0f %.0f %.0f %.0f %.0f", x1, y1, x2, y2, x3, y3);
73 cout << m_buf;
74 break;
75 case PathData::e_rect:
76 {
77 x1 = *data_itr; ++data_itr;
78 y1 = *data_itr; ++data_itr;
79 double w = *data_itr; ++data_itr;
80 double h = *data_itr; ++data_itr;
81 x2 = x1 + w;
82 y2 = y1;
83 x3 = x2;
84 y3 = y1 + h;
85 double x4 = x1;
86 double y4 = y3;
87 sprintf(m_buf, "M%.0f %.0f L%.0f %.0f L%.0f %.0f L%.0f %.0f Z",
88 x1, y1, x2, y2, x3, y3, x4, y4);
89 cout << m_buf;
90 }
91 break;
92 case PathData::e_closepath:
93 cout << " Close Path" << endl;
94 break;
95 default:
96 assert(false);
97 break;
98 }
99 }
100
101 cout << "\" ";
102
103 GState gs = path.GetGState();
104
105 // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
106 if (path.IsStroked())
107 {
108 cout << "Stroke path" << endl;
109
110 if (gs.GetStrokeColorSpace().GetType() == ColorSpace::e_pattern)
111 {
112 cout << "Path has associated pattern" << endl;
113 }
114 else
115 {
116 // Get stroke color (you can use PDFNet color conversion facilities)
117 // ColorPt rgb;
118 // gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
119 }
120 }
121 else
122 {
123 // Do not stroke path
124 }
125
126 if (path.IsFilled())
127 {
128 cout << "Fill path" << endl;
129
130 if (gs.GetFillColorSpace().GetType() == ColorSpace::e_pattern)
131 {
132 cout << "Path has associated pattern" << endl;
133 }
134 else
135 {
136 // ColorPt rgb;
137 // gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
138 }
139 }
140 else
141 {
142 // Do not fill path
143 }
144
145 // Process any changes in graphics state ---------------------------------
146
147 GSChangesIterator gs_itr = reader.GetChangesIterator();
148 for (; gs_itr.HasNext(); gs_itr.Next())
149 {
150 switch(gs_itr.Current())
151 {
152 case GState::e_transform :
153 // Get transform matrix for this element. Unlike path.GetCTM()
154 // that return full transformation matrix gs.GetTransform() return
155 // only the transformation matrix that was installed for this element.
156 //
157 // gs.GetTransform();
158 break;
159 case GState::e_line_width :
160 // gs.GetLineWidth();
161 break;
162 case GState::e_line_cap :
163 // gs.GetLineCap();
164 break;
165 case GState::e_line_join :
166 // gs.GetLineJoin();
167 break;
168 case GState::e_flatness :
169 break;
170 case GState::e_miter_limit :
171 // gs.GetMiterLimit();
172 break;
173 case GState::e_dash_pattern :
174 {
175 // std::vector<double> dashes;
176 // gs.GetDashes(dashes);
177 // gs.GetPhase()
178 }
179 break;
180 case GState::e_fill_color:
181 {
182 if ( gs.GetFillColorSpace().GetType() == ColorSpace::e_pattern &&
183 gs.GetFillPattern().GetType() != PatternColor::e_shading )
184 {
185 //process the pattern data
186 reader.PatternBegin(true);
187 ProcessElements(reader);
188 reader.End();
189 }
190 }
191 break;
192 }
193 }
194 reader.ClearChangeList();
195}
196
197void ProcessText(ElementReader& page_reader)
198{
199 // Begin text element
200 cout << "Begin Text Block:" << endl;
201
202 Element element;
203 while ((element = page_reader.Next()) != 0)
204 {
205 switch (element.GetType())
206 {
207 case Element::e_text_end:
208 // Finish the text block
209 cout << "End Text Block." << endl;
210 return;
211
212 case Element::e_text:
213 {
214 GState gs = element.GetGState();
215
216 ColorSpace cs_fill = gs.GetFillColorSpace();
217 ColorPt fill = gs.GetFillColor();
218
219 ColorPt out;
220 cs_fill.Convert2RGB(fill, out);
221
222
223 ColorSpace cs_stroke = gs.GetStrokeColorSpace();
224 ColorPt stroke = gs.GetStrokeColor();
225
226 Font font = gs.GetFont();
227
228 cout << "Font Name: " << font.GetName() << endl;
229 // font.IsFixedWidth();
230 // font.IsSerif();
231 // font.IsSymbolic();
232 // font.IsItalic();
233 // ...
234
235 // double font_size = gs.GetFontSize();
236 // double word_spacing = gs.GetWordSpacing();
237 // double char_spacing = gs.GetCharSpacing();
238 // const UString* txt = element.GetTextString();
239
240 if ( font.GetType() == Font::e_Type3 )
241 {
242 //type 3 font, process its data
243 for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
244 {
245 page_reader.Type3FontBegin(itr.Current());
246 ProcessElements(page_reader);
247 page_reader.End();
248 }
249 }
250
251 else
252 {
253 Matrix2D text_mtx = element.GetTextMatrix();
254 double x, y;
255 unsigned int char_code;
256
257 for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
258 {
259 cout << "Character code: ";
260 char_code = itr.Current().char_code;
261 if (char_code>=32 || char_code<=127)
262 {
263 // Print if in ASCII range...
264 cout << char(char_code);
265 }
266
267 x = itr.Current().x; // character positioning information
268 y = itr.Current().y;
269
270 // Use element.GetCTM() if you are interested in the CTM
271 // (current transformation matrix).
272 Matrix2D ctm = element.GetCTM();
273
274 // To get the exact character positioning information you need to
275 // concatenate current text matrix with CTM and then multiply
276 // relative positioning coordinates with the resulting matrix.
277 Matrix2D mtx = ctm * text_mtx;
278 mtx.Mult(x, y);
279
280 // Get glyph path...
281 //vector<UChar> oprs;
282 //vector<double> glyph_data;
283 //font.GetGlyphPath(char_code, oprs, glyph_data, false, 0);
284 }
285 }
286
287 cout << endl;
288 }
289 break;
290 }
291 }
292}
293
294void ProcessImage(Element image)
295{
296 bool image_mask = image.IsImageMask();
297 bool interpolate = image.IsImageInterpolate();
298 int width = image.GetImageWidth();
299 int height = image.GetImageHeight();
300 int out_data_sz = width * height * 3;
301
302 cout << "Image:"
303 << " width=\"" << width << "\""
304 << " height=\"" << height << "\"" << endl;
305
306 // Matrix2D& mtx = image->GetCTM(); // image matrix (page positioning info)
307
308 // You can use GetImageData to read the raw (decoded) image data
309 //image->GetBitsPerComponent();
310 //image->GetImageData(); // get raw image data
311 // .... or use Image2RGB filter that converts every image to RGB format,
312 // This should save you time since you don't need to deal with color conversions,
313 // image up-sampling, decoding etc.
314
315 Image2RGB img_conv(image); // Extract and convert image to RGB 8-bpc format
316 FilterReader reader(img_conv);
317
318 // A buffer used to keep image data.
319 std::vector<UChar> image_data_out;
320 image_data_out.resize(out_data_sz);
321
322 reader.Read(&image_data_out.front(), out_data_sz);
323 // &image_data_out.front() contains RGB image data.
324
325 // Note that you don't need to read a whole image at a time. Alternatively
326 // you can read a chuck at a time by repeatedly calling reader.Read(buf, buf_sz)
327 // until the function returns 0.
328}
329
330void ProcessElements(ElementReader& reader)
331{
332 Element element;
333 while ((element = reader.Next()) != 0) // Read page contents
334 {
335 switch (element.GetType())
336 {
337 case Element::e_path: // Process path data...
338 {
339 ProcessPath(reader, element);
340 }
341 break;
342 case Element::e_text_begin: // Process text block...
343 {
344 ProcessText(reader);
345 }
346 break;
347 case Element::e_form: // Process form XObjects
348 {
349 reader.FormBegin();
350 ProcessElements(reader);
351 reader.End();
352 }
353 break;
354 case Element::e_image: // Process Images
355 {
356 ProcessImage(element);
357 }
358 break;
359 }
360 }
361}
362
363int main(int argc, char *argv[])
364{
365 int ret = 0;
366 PDFNet::Initialize(LicenseKey);
367
368 // Relative path to the folder containing test files.
369 string input_path = "../../TestFiles/";
370 // string output_path = "../../TestFiles/Output/";
371
372
373 try // Extract text data from all pages in the document
374 {
375 cout << "-------------------------------------------------" << endl;
376 cout << "Extract page element information from all " << endl;
377 cout << "pages in the document." << endl;
378
379 PDFDoc doc((input_path + "newsletter.pdf").c_str());
380 doc.InitSecurityHandler();
381
382 int pgnum = doc.GetPageCount();
383 PageIterator page_begin = doc.GetPageIterator();
384
385 ElementReader page_reader;
386
387 PageIterator itr;
388 for (itr = page_begin; itr.HasNext(); itr.Next()) // Read every page
389 {
390 cout << "Page " << itr.Current().GetIndex() << "----------------------------------------" << endl;
391 page_reader.Begin(itr.Current());
392 ProcessElements(page_reader);
393 page_reader.End();
394 }
395
396 cout << "Done." << endl;
397 }
398 catch(Exception& e)
399 {
400 cout << e << endl;
401 ret = 1;
402 }
403 catch(...)
404 {
405 cout << "Unknown Exception" << endl;
406 ret = 1;
407 }
408
409 PDFNet::Terminate();
410 return ret;
411}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales