PDF Data Extraction - Images, Text, Paths - Python Sample Code

Sample code for using Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Sample code provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our Server SDK and PDF Data Extraction SDK Capabilities.

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
def ProcessPath(reader, path):
    """Print the geometry and paint state of a single path element.

    The path operators and their coordinates are written to stdout in an
    SVG-like notation (M / L / C, rectangles expanded to their four
    corners). The function then reports whether the path is stroked and/or
    filled, and finally walks the graphics-state changes exposed by
    ``reader.GetChangesIterator()``. When the fill color changed to a
    non-shading pattern, the pattern content is processed recursively
    through ``ProcessElements``.

    reader -- element reader that produced ``path``; its change list is
              iterated and then cleared with ``ClearChangeList()``.
    path   -- the path element to describe.

    Raises AssertionError if the path contains an operator that is not
    handled below.
    """
    if path.IsClippingPath():
        print("This is a clipping path")

    pathData = path.GetPathData()
    data = pathData.GetPoints()      # flat sequence of coordinates
    opr = pathData.GetOperators()    # one entry per path operator

    # Cursor into `data`; every operator consumes a fixed number of values.
    data_index = 0

    # Use path.GetCTM() if you are interested in CTM (current transformation matrix).

    sys.stdout.write("Path Data Points := \"")

    for opr_index in range(len(opr)):
        op = opr[opr_index]
        if op == PathData.e_moveto:
            x1, y1 = data[data_index], data[data_index + 1]
            data_index += 2
            sys.stdout.write("M" + str(x1) + " " + str(y1))
        elif op == PathData.e_lineto:
            x1, y1 = data[data_index], data[data_index + 1]
            data_index += 2
            sys.stdout.write(" L" + str(x1) + " " + str(y1))
        elif op == PathData.e_cubicto:
            x1, y1 = data[data_index], data[data_index + 1]
            x2, y2 = data[data_index + 2], data[data_index + 3]
            x3, y3 = data[data_index + 4], data[data_index + 5]
            data_index += 6
            sys.stdout.write(" C" + str(x1) + " " + str(y1) + " " + str(x2) +
                             " " + str(y2) + " " + str(x3) + " " + str(y3))
        elif op == PathData.e_rect:
            x1, y1 = data[data_index], data[data_index + 1]
            w, h = data[data_index + 2], data[data_index + 3]
            data_index += 4
            # Expand (x, y, width, height) into the four corner points.
            x2 = x1 + w
            y2 = y1
            x3 = x2
            y3 = y1 + h
            x4 = x1
            y4 = y3
            sys.stdout.write("M" + str(x1) + " " + str(y1) + " L" + str(x2) + " " + str(y2) + " L" +
                             str(x3) + " " + str(y3) + " L" + str(x4) + " " + str(y4) + " Z")
        elif op == PathData.e_closepath:
            print(" Close Path")
        else:
            # Raised explicitly instead of `assert False`: a bare assert is
            # stripped under `python -O`, which would silently skip the
            # operator and leave `data_index` out of step with `opr`.
            raise AssertionError("Unhandled path operator: " + str(op))

    sys.stdout.write("\" ")
    gs = path.GetGState()

    # Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    if path.IsStroked():
        print("Stroke path")

        if gs.GetStrokeColorSpace().GetType() == ColorSpace.e_pattern:
            print("Path has associated pattern")
        # Otherwise the stroke color is available through PDFNet's color
        # conversion facilities, e.g.:
        #   rgb = gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor())

    if path.IsFilled():
        print("Fill path")

        if gs.GetFillColorSpace().GetType() == ColorSpace.e_pattern:
            print("Path has associated pattern")
        # Otherwise, e.g.:
        #   rgb = gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor())

    # Process any changes in graphics state ---------------------------------
    gs_itr = reader.GetChangesIterator()
    while gs_itr.HasNext():
        change = gs_itr.Current()
        if change == GState.e_transform:
            # Get transform matrix for this element. Unlike path.GetCTM(),
            # which returns the full transformation matrix, gs.GetTransform()
            # returns only the matrix that was installed for this element.
            #
            # gs.GetTransform()
            pass
        elif change == GState.e_line_width:
            # gs.GetLineWidth()
            pass
        elif change == GState.e_line_cap:
            # gs.GetLineCap()
            pass
        elif change == GState.e_line_join:
            # gs.GetLineJoin()
            pass
        elif change == GState.e_flatness:
            pass
        elif change == GState.e_miter_limit:
            # gs.GetMiterLimit()
            pass
        elif change == GState.e_dash_pattern:
            # dashes = gs.GetDashes()
            # gs.GetPhase()
            pass
        elif change == GState.e_fill_color:
            if (gs.GetFillColorSpace().GetType() == ColorSpace.e_pattern and
                    gs.GetFillPattern().GetType() != PatternColor.e_shading):
                # Process the pattern data as a nested element stream.
                reader.PatternBegin(True)
                ProcessElements(reader)
                reader.End()
        gs_itr.Next()
    reader.ClearChangeList()
148
def ProcessText(page_reader):
    """Print the text runs of the text block the reader is positioned in.

    Intended to be called right after an ``Element.e_text_begin`` element
    was read. Elements are pulled from ``page_reader`` until the matching
    ``Element.e_text_end`` is seen (or the reader returns ``None``). For
    every ``Element.e_text`` run the font name is printed; glyphs of Type 3
    fonts are processed recursively through ``ProcessElements``, while for
    all other fonts the characters with codes 32..255 are written to
    stdout.

    page_reader -- the element reader to consume elements from.
    """
    # Begin text element
    print("Begin Text Block:")

    element = page_reader.Next()

    while element is not None:
        element_type = element.GetType()
        if element_type == Element.e_text_end:
            # Finish the text block
            print("End Text Block.")
            return

        if element_type == Element.e_text:
            gs = element.GetGState()

            # The color queries below illustrate the API; their results are
            # not used further in this function.
            cs_fill = gs.GetFillColorSpace()
            fill = gs.GetFillColor()

            out = cs_fill.Convert2RGB(fill)

            cs_stroke = gs.GetStrokeColorSpace()
            stroke = gs.GetStrokeColor()

            font = gs.GetFont()
            print("Font Name: " + font.GetName())
            # font.IsFixedWidth()
            # font.IsSerif()
            # font.IsSymbolic()
            # font.IsItalic()
            # ...

            # font_size = gs.GetFontSize()
            # word_spacing = gs.GetWordSpacing()
            # char_spacing = gs.GetCharSpacing()
            # txt = element.GetTextString()
            if font.GetType() == Font.e_Type3:
                # Type 3 font: each glyph is itself a content stream.
                itr = element.GetCharIterator()
                while itr.HasNext():
                    page_reader.Type3FontBegin(itr.Current())
                    ProcessElements(page_reader)
                    page_reader.End()
                    # Advance to the next glyph; without this the loop
                    # never terminates.
                    itr.Next()
            else:
                text_mtx = element.GetTextMatrix()

                itr = element.GetCharIterator()
                while itr.HasNext():
                    char_data = itr.Current()
                    char_code = char_data.char_code
                    if 32 <= char_code <= 255:  # skip control codes and multi-byte codes
                        a = font.MapToUnicode(char_code)
                        sys.stdout.write(a[0] if sys.version_info.major < 3 else ascii(a[0]))

                    pt = Point()
                    pt.x = char_data.x  # character positioning information
                    pt.y = char_data.y

                    # Use element.GetCTM() if you are interested in the CTM
                    # (current transformation matrix).
                    ctm = element.GetCTM()

                    # To get the exact character positioning information you need to
                    # concatenate current text matrix with CTM and then multiply
                    # relative positioning coordinates with the resulting matrix.
                    mtx = ctm.Multiply(text_mtx)
                    mtx.Mult(pt)
                    itr.Next()
            print("")
        element = page_reader.Next()
217
def ProcessImage(image):
    """Print an image element's dimensions and read its pixels as RGB.

    The image is passed through an ``Image2RGB`` filter and
    ``width * height * 3`` bytes are read from it in one call. The pixel
    data is read for demonstration only and is not returned.

    image -- the image element to inspect.
    """
    # Queried to illustrate the API; the values are not used below.
    image_mask = image.IsImageMask()
    interpolate = image.IsImageInterpolate()

    width = image.GetImageWidth()
    height = image.GetImageHeight()
    out_data_sz = width * height * 3  # three components (R, G, B) per pixel

    # Both attribute values are wrapped in double quotes.
    print("Image: width=\"" + str(width) + "\"" + " height=\"" + str(height) + "\"")

    # mtx = image.GetCTM()  # image matrix (page positioning info)

    # You can use GetImageData to read the raw (decoded) image data:
    #   image.GetBitsPerComponent()
    #   image.GetImageData()  # get raw image data
    # ... or use the Image2RGB filter that converts every image to RGB format.
    # This should save you time since you don't need to deal with color
    # conversions, image up-sampling, decoding etc.

    img_conv = Image2RGB(image)  # Extract and convert image to RGB 8-bps format
    reader = FilterReader(img_conv)

    image_data_out = reader.Read(out_data_sz)

    # Note that you don't need to read a whole image at a time. Alternatively
    # you can read a chunk at a time by calling reader.Read() repeatedly with
    # a smaller size until no more data is returned.
244
def ProcessElements(reader):
    """Read every remaining element from ``reader`` and dispatch on its type.

    Paths go to ``ProcessPath``, text blocks to ``ProcessText`` and images
    to ``ProcessImage``. Form XObjects are entered with ``FormBegin()``,
    processed recursively with this function and left again with ``End()``.
    Elements of any other type are skipped.

    reader -- the element reader to consume; iteration stops when
              ``reader.Next()`` returns ``None``.
    """
    while True:
        current = reader.Next()  # Read page contents
        if current is None:
            break

        kind = current.GetType()
        if kind == Element.e_path:  # Process path data...
            ProcessPath(reader, current)
        elif kind == Element.e_text_begin:  # Process text block...
            ProcessText(reader)
        elif kind == Element.e_form:  # Process form XObjects
            reader.FormBegin()
            ProcessElements(reader)
            reader.End()
        elif kind == Element.e_image:  # Process Images
            ProcessImage(current)
260
if __name__ == '__main__':
    # Script entry point: open the sample document and print the element
    # information of every page.
    PDFNet.Initialize(LicenseKey)
    try:
        # Relative path to the folder containing the test files.
        input_path = "../../TestFiles/"

        # Extract text data from all pages in the document

        print("__________________________________________________")
        print("Extract page element information from all ")
        print("pages in the document.")

        doc = PDFDoc(input_path + "newsletter.pdf")
        try:
            doc.InitSecurityHandler()
            page_reader = ElementReader()

            itr = doc.GetPageIterator()
            while itr.HasNext():  # Read every page
                page = itr.Current()
                print("Page " + str(page.GetIndex()) + "----------------------------------------")
                page_reader.Begin(page)
                ProcessElements(page_reader)
                page_reader.End()
                itr.Next()
        finally:
            # Close the document even if processing a page raised.
            doc.Close()
    finally:
        # Always shut the library down, including on error.
        PDFNet.Terminate()
    print("Done.")

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales