Extract Image from PDFs - ImageExtract

Sample code for using Apryse SDK to extract images from PDF files, along with their positioning information and DPI; provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB. Instead of converting PDF images to a Bitmap, you can also extract uncompressed/compressed image data directly using element.GetImageData() (described in the PDF Data Extraction code sample).

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1//
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3//
4
5using System;
6using System.Drawing;
7using System.Drawing.Imaging;
8
9using pdftron;
10using pdftron.Common;
11using pdftron.PDF;
12using pdftron.SDF;
13using pdftron.Filters;
14
namespace ImageExtractTestCS
{
    /// <summary>
    ///-----------------------------------------------------------------------------------
    /// This sample illustrates one approach to PDF image extraction
    /// using PDFNet.
    ///
    /// Note: Besides direct image export, you can also convert PDF images
    /// to GDI+ Bitmap, or extract uncompressed/compressed image data directly
    /// using element.GetImageData() (e.g. as illustrated in ElementReaderAdv
    /// sample project).
    ///-----------------------------------------------------------------------------------
    /// </summary>
    class Class1
    {
        // Obtained once, when the class is initialized.
        // NOTE(review): presumably this makes sure the native PDFNet library is
        // loaded before any PDFNet call is made - confirm against PDFNetLoader docs.
        private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();

        // An explicit (even empty) static constructor makes C# run the static field
        // initializers of this class right before its first use, rather than at
        // some earlier point of the runtime's choosing.
        static Class1() {}

        // Running number of the image currently being reported; also used to
        // build the exported file name.
        static int image_counter = 0;

        // Relative path to the folder containing test files.
        static string input_path = "../../../../TestFiles/";
        static string output_path = "../../../../TestFiles/Output/";

        /// <summary>
        /// Walks every element delivered by <paramref name="reader"/>, prints the
        /// width, height, bits per component and page coordinates of each image it
        /// meets, and exports every XObject image to <c>output_path</c>.
        /// Form XObjects are entered and scanned recursively.
        /// </summary>
        /// <param name="doc">
        /// The document being processed. It is not read by this method itself and
        /// is only handed on to the recursive call.
        /// </param>
        /// <param name="reader">
        /// A reader on which <c>Begin</c> (or <c>FormBegin</c>) has already been
        /// called for the content to scan.
        /// </param>
        static void ImageExtract(PDFDoc doc, ElementReader reader)
        {
            Element element;
            while ((element = reader.Next()) != null)
            {
                switch (element.GetType())
                {
                    case Element.Type.e_image:
                    case Element.Type.e_inline_image:
                    {
                        Console.WriteLine("--> Image: {0}", ++image_counter);
                        Console.WriteLine(" Width: {0}", element.GetImageWidth());
                        Console.WriteLine(" Height: {0}", element.GetImageHeight());
                        Console.WriteLine(" BPC: {0}", element.GetBitsPerComponent());

                        // Map the point (1, 1) through the element's CTM to get the
                        // second corner; m_h / m_v are reported as the first corner.
                        Matrix2D ctm = element.GetCTM();
                        double x2 = 1, y2 = 1;
                        ctm.Mult(ref x2, ref y2);

                        // Write the coords to 2 decimal places.
                        Console.WriteLine(" Coords: x1={0:N2}, y1={1:N2}, x2={2:N2}, y2={3:N2}", ctm.m_h, ctm.m_v, x2, y2);

                        // Only XObject images are exported; inline images are
                        // reported above but not written to disk.
                        if (element.GetType() == Element.Type.e_image)
                        {
                            pdftron.PDF.Image image = new pdftron.PDF.Image(element.GetXObject());

                            string fname = output_path + "image_extract1_" + image_counter.ToString();
                            image.Export(fname); // or ExportAsPng() or ExportAsTiff() ...
                        }
                        break;
                    }
                    case Element.Type.e_form: // Process form XObjects
                    {
                        reader.FormBegin();
                        ImageExtract(doc, reader);
                        reader.End();
                        break;
                    }
                }
            }
        }

        /// <summary>
        /// Entry point. Runs two image-extraction passes over "newsletter.pdf":
        /// one that traverses each page's display list, and one that scans the
        /// low-level object table for image XObject streams.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            PDFNet.Initialize(PDFTronLicense.Key);

            // The finally block guarantees PDFNet.Terminate() runs even when an
            // exception other than PDFNetException escapes one of the examples.
            try
            {
                // Example 1:
                // Extract images by traversing the display list for
                // every page. With this approach it is possible to obtain
                // image positioning information and DPI.
                try
                {
                    using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
                    using (ElementReader reader = new ElementReader())
                    {
                        doc.InitSecurityHandler();
                        PageIterator itr;
                        for (itr = doc.GetPageIterator(); itr.HasNext(); itr.Next())
                        {
                            reader.Begin(itr.Current());
                            ImageExtract(doc, reader);
                            reader.End();
                        }

                        Console.WriteLine("Done.");
                    }
                }
                catch (PDFNetException e)
                {
                    Console.WriteLine(e.Message);
                }

                Console.WriteLine("----------------------------------------------------------------");

                // Example 2:
                // Extract images by scanning the low-level document.
                try
                {
                    using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
                    {
                        doc.InitSecurityHandler();
                        image_counter = 0;

                        SDFDoc cos_doc = doc.GetSDFDoc();
                        int num_objs = cos_doc.XRefSize();
                        for (int i = 1; i < num_objs; ++i)
                        {
                            Obj obj = cos_doc.GetObj(i);
                            if (obj != null && !obj.IsFree() && obj.IsStream())
                            {
                                // Process only images
                                DictIterator itr = obj.Find("Subtype");
                                if (!itr.HasNext() || itr.Value().GetName() != "Image")
                                {
                                    continue;
                                }

                                itr = obj.Find("Type");
                                if (!itr.HasNext() || itr.Value().GetName() != "XObject")
                                {
                                    continue;
                                }

                                pdftron.PDF.Image image = new pdftron.PDF.Image(obj);

                                Console.WriteLine("--> Image: {0}", ++image_counter);
                                Console.WriteLine(" Width: {0}", image.GetImageWidth());
                                Console.WriteLine(" Height: {0}", image.GetImageHeight());
                                Console.WriteLine(" BPC: {0}", image.GetBitsPerComponent());

                                string fname = output_path + "image_extract2_" + image_counter.ToString();
                                image.Export(fname); // or ExportAsPng() or ExportAsTiff() ...

                                // Convert PDF bitmap to GDI+ Bitmap...
                                //Bitmap bmp = image.GetBitmap();
                                //bmp.Save(fname, ImageFormat.Png);
                                //bmp.Dispose();

                                // Instead of converting PDF images to a Bitmap, you can also extract
                                // uncompressed/compressed image data directly using element.GetImageData()
                                // as illustrated in ElementReaderAdv sample project.
                            }
                        }
                        Console.WriteLine("Done.");
                    }
                }
                catch (PDFNetException e)
                {
                    Console.WriteLine(e.Message);
                }
            }
            finally
            {
                PDFNet.Terminate();
            }
        }
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales