Extract Image from PDFs - Python Sample Code

Sample code for using Apryse SDK to extract images from PDF files, along with their positioning information and DPI; provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB. Instead of converting PDF images to a Bitmap, you can also extract uncompressed/compressed image data directly using element.GetImageData() (described in the PDF Data Extraction code sample).

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
14#-----------------------------------------------------------------------------------
15# This sample illustrates one approach to PDF image extraction
16# using PDFNet.
17#
18# Note: Besides direct image export, you can also convert PDF images
19# to GDI+ Bitmap, or extract uncompressed/compressed image data directly
20# using element.GetImageData() (e.g. as illustrated in ElementReaderAdv
21# sample project).
22#-----------------------------------------------------------------------------------
23
# Running count of the images reported so far. ImageExtract() increments it
# through a `global` declaration and uses it to number the exported files.
image_counter = 0

# Relative path to the folder containing the test files, and to the folder
# that receives the exported images.
input_path = "../../TestFiles/"
output_path = "../../TestFiles/Output/"
29
def ImageExtract(reader):
    """Report every image reachable through *reader* and export image XObjects.

    The reader's elements are consumed one by one until Next() returns None.
    For each image or inline-image element the module-level ``image_counter``
    is incremented and the image's width, height, bits per component and
    placement coordinates are printed. Elements of type ``Element.e_image``
    are additionally exported under ``output_path``; inline images are only
    reported. Form elements are entered with FormBegin() and processed
    recursively, then closed with End().

    Args:
        reader: An element reader that has already been started by the
            caller (Begin() for a page, or FormBegin() for a nested form).
    """
    global image_counter

    element = reader.Next()
    # Identity test: `!= None` would go through any overloaded comparison
    # operator on the wrapped SDK object.
    while element is not None:
        element_type = element.GetType()

        if element_type in (Element.e_image, Element.e_inline_image):
            image_counter += 1
            print("--> Image: " + str(image_counter))
            print(" Width: " + str(element.GetImageWidth()))
            print(" Height: " + str(element.GetImageHeight()))
            print(" BPC: " + str(element.GetBitsPerComponent()))

            # (m_h, m_v) is printed as the first corner; the opposite corner
            # is obtained by mapping the point (1, 1) through the CTM.
            ctm = element.GetCTM()
            corner = ctm.Mult(Point(1, 1))
            print(" Coords: x1=%.2f, y1=%.2f, x2=%.2f, y2=%.2f" % (ctm.m_h, ctm.m_v, corner.x, corner.y))

            if element_type == Element.e_image:
                image = Image(element.GetXObject())

                # No file extension is supplied here.
                # NOTE(review): presumably Export() picks the format and
                # extension itself - confirm against the SDK documentation.
                path = output_path + "image_extract1_" + str(image_counter)
                image.Export(path)

                # Alternative exporters with an explicit format:
                #   image.ExportAsTiff(path + ".tif")
                #   image.ExportAsPng(path + ".png")
        elif element_type == Element.e_form:
            # Descend into the form, then close it before moving on.
            reader.FormBegin()
            ImageExtract(reader)
            reader.End()

        element = reader.Next()
67
def main():
    """Run both image-extraction examples against ``newsletter.pdf``.

    Example 1 traverses the display list of every page with an ElementReader
    and hands it to ImageExtract(), which reports and exports the images.
    Example 2 scans the low-level objects of the document and exports every
    stream whose ``Type`` is ``XObject`` and whose ``Subtype`` is ``Image``.

    Each document is closed and PDFNet is terminated in ``finally`` clauses,
    so the cleanup also runs when extraction raises.
    """
    # Initialize PDFNet
    PDFNet.Initialize(LicenseKey)
    try:
        # Example 1:
        # Extract images by traversing the display list for
        # every page. With this approach it is possible to obtain
        # image positioning information and DPI.
        doc = PDFDoc(input_path + "newsletter.pdf")
        try:
            doc.InitSecurityHandler()

            reader = ElementReader()

            # Read every page
            itr = doc.GetPageIterator()
            while itr.HasNext():
                reader.Begin(itr.Current())
                ImageExtract(reader)
                reader.End()
                itr.Next()
        finally:
            doc.Close()
        print("Done.")

        print("----------------------------------------------------------------")

        # Example 2:
        # Extract images by scanning the low-level document.
        doc = PDFDoc(input_path + "newsletter.pdf")
        try:
            doc.InitSecurityHandler()

            # Local counter for this example; deliberately not the
            # module-level image_counter used by ImageExtract().
            extracted = 0

            cos_doc = doc.GetSDFDoc()
            num_objs = cos_doc.XRefSize()

            # Object numbers start at 1 here; entry 0 is never inspected.
            for obj_num in range(1, num_objs):
                obj = cos_doc.GetObj(obj_num)
                if obj is None or obj.IsFree() or not obj.IsStream():
                    continue

                # Process only images
                itr = obj.Find("Type")
                if not itr.HasNext() or itr.Value().GetName() != "XObject":
                    continue

                itr = obj.Find("Subtype")
                if not itr.HasNext() or itr.Value().GetName() != "Image":
                    continue

                image = Image(obj)

                extracted += 1
                print("--> Image: " + str(extracted))
                print(" Width: " + str(image.GetImageWidth()))
                print(" Height: " + str(image.GetImageHeight()))
                print(" BPC: " + str(image.GetBitsPerComponent()))

                path = output_path + "image_extract2_" + str(extracted)
                image.Export(path)

                # Alternative exporters with an explicit format:
                #   image.ExportAsTiff(path + ".tif")
                #   image.ExportAsPng(path + ".png")
        finally:
            doc.Close()
    finally:
        PDFNet.Terminate()
    print("Done.")
143
# Run the sample only when this file is executed as a script.
if __name__ == '__main__':
    main()

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales