Extract Images from PDFs - C++ Sample Code

Sample code for using the Apryse SDK to extract images from PDF files, along with their positioning information and DPI; provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB. Besides converting PDF images to a Bitmap, you can also extract uncompressed/compressed image data directly using element.GetImageData() (described in the PDF Data Extraction code sample). Learn more about our Server SDK and PDF Data Extraction SDK Capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/PDFNet.h>
7#include <PDF/PDFDoc.h>
8#include <PDF/ElementReader.h>
9#include <PDF/Image.h>
10#include "../../LicenseKey/CPP/LicenseKey.h"
11
12//-----------------------------------------------------------------------------------
13// This sample illustrates one approach to PDF image extraction
14// using PDFNet.
15//
16// Note: Besides direct image export, you can also convert PDF images
17// to GDI+ Bitmap, or extract uncompressed/compressed image data directly
18// using element.GetImageData() (e.g. as illustrated in ElementReaderAdv
19// sample project).
20//-----------------------------------------------------------------------------------
21
#include <cstdio>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <string>
24
25using namespace std;
26
27using namespace pdftron;
28using namespace Common;
29using namespace SDF;
30using namespace PDF;
31
// Folder the sample PDFs are read from, relative to the working directory.
std::string input_path("../../TestFiles/");
// Folder the extracted images are written to, relative to the working directory.
std::string output_path("../../TestFiles/Output/");

// Running index of the images reported so far; shared by both examples.
int image_counter(0);
37
38void ImageExtract(ElementReader& reader)
39{
40 // Set the precision for printing doubles on cout to 3 decimal places.
41 ios iostate(NULL);
42 iostate.copyfmt(cout);
43 cout << fixed << showpoint << setprecision(3);
44
45 Element element;
46 while ((element = reader.Next()) != 0)
47 {
48 switch (element.GetType())
49 {
50 case Element::e_image:
51 case Element::e_inline_image:
52 {
53 cout << "--> Image: " << ++image_counter << endl;
54 cout << " Width: " << element.GetImageWidth() << endl;
55 cout << " Height: " << element.GetImageHeight() << endl;
56 cout << " BPC: " << element.GetBitsPerComponent() << endl;
57
58 Common::Matrix2D ctm = element.GetCTM();
59 double x2=1, y2=1;
60 ctm.Mult(x2, y2);
61 printf(" Coords: x1=%.2f, y1=%.2f, x2=%.2f, y2=%.2f\n", ctm.m_h, ctm.m_v, x2, y2);
62
63 if (element.GetType() == Element::e_image)
64 {
65 Image image(element.GetXObject());
66
67 char fname[256];
68 sprintf(fname, "image_extract1_%d", image_counter);
69
70 string path(output_path + fname);
71 image.Export(path.c_str());
72
73 //string path(output_path + fname + ".tif");
74 //image.ExportAsTiff(path.c_str());
75
76 //string path(output_path + fname + ".png");
77 //image.ExportAsPng(path.c_str());
78 }
79 }
80 break;
81 case Element::e_form: // Process form XObjects
82 reader.FormBegin();
83 ImageExtract(reader);
84 reader.End();
85 break;
86 }
87 }
88
89 // Reset cout's state.
90 cout.copyfmt(iostate);
91}
92
93int main(int argc, char *argv[])
94{
95 int ret = 0;
96
97 // Initialize PDFNet
98 PDFNet::Initialize(LicenseKey);
99
100 // Example 1:
101 // Extract images by traversing the display list for
102 // every page. With this approach it is possible to obtain
103 // image positioning information and DPI.
104 try
105 {
106 PDFDoc doc((input_path + "newsletter.pdf").c_str());
107 doc.InitSecurityHandler();
108 ElementReader reader;
109 // Read every page
110 for (PageIterator itr=doc.GetPageIterator(); itr.HasNext(); itr.Next())
111 {
112 reader.Begin(itr.Current());
113 ImageExtract(reader);
114 reader.End();
115 }
116
117 cout << "Done." << endl;
118 }
119 catch(Common::Exception& e)
120 {
121 cout << e << endl;
122 ret = 1;
123 }
124 catch(...)
125 {
126 cout << "Unknown Exception" << endl;
127 ret = 1;
128 }
129
130 cout << "----------------------------------------------------------------" << endl;
131
132 // Example 2:
133 // Extract images by scanning the low-level document.
134 try
135 {
136 PDFDoc doc((input_path + "newsletter.pdf").c_str());
137
138 doc.InitSecurityHandler();
139 image_counter = 0;
140
141 SDFDoc& cos_doc=doc.GetSDFDoc();
142 int num_objs = cos_doc.XRefSize();
143 for(int i=1; i<num_objs; ++i)
144 {
145 Obj obj = cos_doc.GetObj(i);
146 if(obj && !obj.IsFree() && obj.IsStream())
147 {
148 // Process only images
149 DictIterator itr = obj.Find("Type");
150 if(!itr.HasNext() || strcmp(itr.Value().GetName(), "XObject"))
151 continue;
152
153 itr = obj.Find("Subtype");
154 if(!itr.HasNext() || strcmp(itr.Value().GetName(), "Image"))
155 continue;
156
157 PDF::Image image(obj);
158 cout << "--> Image: " << ++image_counter << endl;
159 cout << " Width: " << image.GetImageWidth() << endl;
160 cout << " Height: " << image.GetImageHeight() << endl;
161 cout << " BPC: " << image.GetBitsPerComponent() << endl;
162
163 char fname[256];
164 sprintf(fname, "image_extract2_%d", image_counter);
165 string path(output_path + fname);
166 image.Export(path.c_str());
167
168 //string path(output_path + fname + ".tif");
169 //image.ExportAsTiff(path.c_str());
170
171 //string path(output_path + fname + ".png");
172 //image.ExportAsPng(path.c_str());
173 }
174 }
175
176 cout << "Done." << endl;
177 }
178 catch(Common::Exception& e)
179 {
180 cout << e << endl;
181 ret = 1;
182 }
183 catch(...)
184 {
185 cout << "Unknown Exception" << endl;
186 ret = 1;
187 }
188
189 PDFNet::Terminate();
190 return ret;
191}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales