PDF Data Extraction - Images, Text, Paths - Ruby Sample Code

Sample code for using Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Sample code provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
# Prints the geometry of a path element, reports whether it is stroked and/or
# filled, and then walks the graphics-state changes recorded by the reader.
#
# reader - the element reader the path came from; used for the change list and
#          for descending into tiling-pattern content.
# path   - the path element to describe.
def ProcessPath(reader, path)
  puts "This is a clipping path" if path.IsClippingPath

  path_data = path.GetPathData
  points = path_data.GetPoints
  operators = path_data.GetOperators

  # The coordinates arrive as one flat list; each operator consumes a fixed
  # number of values from the current position.
  cursor = 0
  take = lambda do |count|
    values = (0...count).map { |offset| points[cursor + offset] }
    cursor += count
    values
  end

  # Use path.GetCTM if you are interested in CTM (current transformation matrix).
  print "Path Data Points := \""

  operators.size.times do |i|
    case operators[i].ord
    when PathData::E_moveto
      x, y = take.call(2)
      puts "M#{x} #{y}"
    when PathData::E_lineto
      x, y = take.call(2)
      print " L#{x} #{y}"
    when PathData::E_cubicto
      # Six values: x1 y1 x2 y2 x3 y3.
      print " C" + take.call(6).join(" ")
    when PathData::E_rect
      # Four values (x, y, width, height), printed as the four corners.
      x, y, w, h = take.call(4)
      far_x = x + w
      far_y = y + h
      print "M#{x} #{y} L #{far_x} #{y} L #{far_x} #{far_y} L #{x} #{far_y} Z"
    when PathData::E_closepath
      puts " Close Path"
    else
      raise "Assert: false"
    end
  end

  print "\" "
  gs = path.GetGState

  # Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
  if path.IsStroked
    puts "Stroke path"
    # For non-pattern colors you can use the PDFNet color conversion facilities:
    #   rgb = gs.GetStrokeColorSpace.Convert2RGB(gs.GetStrokeColor)
    puts "Path has associated pattern" if gs.GetStrokeColorSpace.GetType == ColorSpace::E_pattern
  end

  if path.IsFilled
    puts "Fill path"
    # For non-pattern colors:
    #   rgb = gs.GetFillColorSpace.Convert2RGB(gs.GetFillColor)
    puts "Path has associated pattern" if gs.GetFillColorSpace.GetType == ColorSpace::E_pattern
  end

  # Process any changes in graphics state ---------------------------------
  changes = reader.GetChangesIterator
  while changes.HasNext
    case changes.Current
    when GState::E_transform
      # gs.GetTransform returns only the matrix installed for this element,
      # whereas path.GetCTM returns the full transformation matrix.
    when GState::E_line_width
      # gs.GetLineWidth
    when GState::E_line_cap
      # gs.GetLineCap
    when GState::E_line_join
      # gs.GetLineJoin
    when GState::E_flatness
      # nothing to report
    when GState::E_miter_limit
      # gs.GetMiterLimit
    when GState::E_dash_pattern
      # dashes = gs.GetDashes
      # gs.GetPhase
    when GState::E_fill_color
      uses_pattern = gs.GetFillColorSpace.GetType == ColorSpace::E_pattern
      if uses_pattern && gs.GetFillPattern.GetType != PatternColor::E_shading
        # Non-shading pattern: descend into the pattern's own content.
        reader.PatternBegin(true)
        ProcessElements(reader)
        reader.End
      end
    end
    changes.Next
  end
  reader.ClearChangeList
end
147
# Prints every text run in the current text block, reading elements from
# page_reader until the matching text-end element is found.
#
# page_reader - the element reader, positioned just after a text-begin element.
#
# Returns as soon as the text-end element is read, or when the reader runs out
# of elements.
def ProcessText(page_reader)
  # Begin text element
  puts "Begin Text Block:"

  element = page_reader.Next

  until element.nil?
    type = element.GetType
    if type == Element::E_text_end
      # Finish the text block
      puts "End Text Block."
      return
    elsif type == Element::E_text
      gs = element.GetGState

      # The color look-ups below are illustrative only; their results are not
      # printed.
      cs_fill = gs.GetFillColorSpace
      fill = gs.GetFillColor

      out = cs_fill.Convert2RGB(fill)

      cs_stroke = gs.GetStrokeColorSpace
      stroke = gs.GetStrokeColor

      font = gs.GetFont
      puts "Font Name: " + font.GetName
      # font.IsFixedWidth
      # font.IsSerif
      # font.IsSymbolic
      # font.IsItalic
      # ...

      # font_size = gs.GetFontSize
      # word_spacing = gs.GetWordSpacing
      # char_spacing = gs.GetCharSpacing
      # txt = element.GetTextString
      if font.GetType == Font::E_Type3
        # Type 3 font: each glyph has its own content, so process it
        # recursively.
        itr = element.GetCharIterator
        while itr.HasNext
          page_reader.Type3FontBegin(itr.Current)
          ProcessElements(page_reader)
          page_reader.End
          # Advance to the next glyph; without this the loop never terminates.
          itr.Next
        end
      else
        text_mtx = element.GetTextMatrix

        itr = element.GetCharIterator
        while itr.HasNext
          char_code = itr.Current.char_code
          if char_code.between?(32, 255) # Print only codes in the 32..255 range
            a = font.MapToUnicode(char_code)
            print a[0]
          end

          pt = Point.new
          pt.x = itr.Current.x # character positioning information
          pt.y = itr.Current.y

          # Use element.GetCTM if you are interested in the CTM
          # (current transformation matrix).
          ctm = element.GetCTM

          # To get the exact character positioning information you need to
          # concatenate current text matrix with CTM and then multiply
          # relative positioning coordinates with the resulting matrix.
          mtx = ctm.Multiply(text_mtx)
          mtx.Mult(pt)
          itr.Next
        end
      end
      puts ""
    end
    element = page_reader.Next
  end
end
223
# Prints an image element's pixel dimensions and reads its pixels converted to
# RGB through an Image2RGB filter.
#
# image - the image element to process.
#
# Returns whatever the filter reader's Read call returns for the requested
# width * height * 3 bytes.
def ProcessImage(image)
  is_mask = image.IsImageMask
  wants_interpolation = image.IsImageInterpolate
  px_wide = image.GetImageWidth
  px_high = image.GetImageHeight
  rgb_byte_count = px_wide * px_high * 3

  # NOTE(review): the emitted line has no closing quote after the height value.
  puts "Image: width=\"#{px_wide}\" height=\"#{px_high}"

  # mtx = image.GetCTM # image matrix (page positioning info)

  # GetImageData gives the raw (decoded) image data:
  #   image.GetBitsPerComponent
  #   image.GetImageData
  # Alternatively the Image2RGB filter converts every image to RGB format, so
  # you do not have to deal with color conversions, image up-sampling,
  # decoding etc.

  rgb_filter = Image2RGB.new(image) # Extract and convert image to RGB 8-bps format
  rgb_reader = FilterReader.new(rgb_filter)

  # You don't need to read a whole image at a time. Alternatively you can read
  # a chunk at a time by repeatedly calling reader.Read(buf, buf_sz) until the
  # function returns 0.
  rgb_bytes = rgb_reader.Read(rgb_byte_count)
end
251
# Reads elements from the reader until none are left and dispatches each one
# to the handler for its type. Element types without a handler are skipped.
#
# reader - the element reader to drain.
def ProcessElements(reader)
  until (element = reader.Next).nil?
    case element.GetType
    when Element::E_path
      # Path data
      ProcessPath(reader, element)
    when Element::E_text_begin
      # Text block: ProcessText reads up to the matching text-end element
      ProcessText(reader)
    when Element::E_form
      # Form XObject: descend into its content, then return to this level
      reader.FormBegin
      ProcessElements(reader)
      reader.End
    when Element::E_image
      # Image
      ProcessImage(element)
    end
  end
end
271
# Driver: opens newsletter.pdf from the test-files folder and prints the page
# element information for every page.
PDFNet.Initialize(PDFTronLicense.Key)

begin
  # Relative path to the folder containing the test files.
  input_path = "../../TestFiles/"

  # Extract text data from all pages in the document

  puts "__________________________________________________"
  puts "Extract page element information from all "
  puts "pages in the document."

  doc = PDFDoc.new(input_path + "newsletter.pdf")
  begin
    doc.InitSecurityHandler
    page_reader = ElementReader.new

    itr = doc.GetPageIterator
    while itr.HasNext # Read every page
      page = itr.Current
      puts "Page " + page.GetIndex.to_s + "----------------------------------------"
      page_reader.Begin(page)
      ProcessElements(page_reader)
      page_reader.End
      itr.Next
    end
  ensure
    # Close the document even when element processing raises.
    doc.Close
  end
ensure
  # Always pair Initialize with Terminate, whether or not processing succeeded.
  PDFNet.Terminate
end
puts "Done."

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales