Extract Text, Read, Parse PDF - TextExtract - Ruby Sample Code

Sample code for using Apryse SDK to read a PDF (parse and extract text), provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. If you'd like to search text on PDF pages, see our code sample for text search.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
# Prints (without a trailing newline) an inline style="..." attribute built
# from a text style: font name, font size, an optional family marker, and the
# color as six upper-case hex digits.
#
# style - object answering IsSerif, GetColor, GetFontName and GetFontSize.
#         GetColor's result is indexed at 0, 1 and 2 for the color channels.
def PrintStyle(style)
  # NOTE(review): the " sans-serif;" marker is emitted when IsSerif is true,
  # which looks inverted. Left unchanged because altering it changes the
  # sample's output -- confirm the intent before fixing.
  family_marker = style.IsSerif() ? " sans-serif;" : ""
  color = style.GetColor
  color_hex = format("%02X%02X%02X;", color[0], color[1], color[2])
  # '%g' drops insignificant trailing zeros (12.0 prints as "12").
  size_text = format('%g', style.GetFontSize)
  print format(" style=\"font-family:%s; font-size:%s;%s color:#%s\"",
               style.GetFontName, size_text, family_marker, color_hex)
end
22
# Prints a description of every text-related element the reader yields until
# reader.Next returns nil.
#
# Text begin/end and new-line elements print a fixed marker, text elements
# print their bounding box followed by their text string, and form elements
# are entered with FormBegin, dumped recursively, and left again with End.
# Any other element type is ignored.
#
# reader - element reader that has already been started by the caller.
def DumpAllText(reader)
  until (current = reader.Next).nil?
    case current.GetType
    when Element::E_text_begin then puts "Text Block Begin"
    when Element::E_text_end then puts "Text Block End"
    when Element::E_text
      box = current.GetBBox
      puts "BBox: #{box.GetX1}, #{box.GetY1}, #{box.GetX2}, #{box.GetY2}"
      puts current.GetTextString
    when Element::E_text_new_line then puts "New Line"
    when Element::E_form
      # Descend into the form's own content, then resume the outer stream.
      reader.FormBegin
      DumpAllText(reader)
      reader.End
    end
  end
end
46
# A utility method used to extract all text content from
# a given selection rectangle. The rectangle coordinates are
# expressed in PDF user/page coordinate system.
#
# page   - page to read.
# pos    - selection rectangle.
# reader - element reader; it is started on the page here and ended again
#          once the search has returned.
#
# Returns the string produced by RectTextSearch.
def ReadTextFromRect(page, pos, reader)
  reader.Begin(page)
  # tap runs reader.End after the search and still returns the found text.
  RectTextSearch(reader, pos).tap { reader.End }
end
56
# A helper method for ReadTextFromRect.
#
# Consumes elements from the reader until reader.Next returns nil and collects
# the text string of every text element whose bounding box passes the
# IntersectRect check against pos, one string per line. Form elements are
# entered with FormBegin, searched recursively, and left with End.
#
# The text is only returned, never printed: the caller decides how to present
# it. (An earlier version also dumped the accumulated text to stdout whenever
# a form was processed, duplicating the caller's output.)
#
# reader - element reader that has already been started by the caller.
# pos    - selection rectangle.
#
# Returns the collected text, each match followed by "\n" ("" if none match).
def RectTextSearch(reader, pos)
  collected = ""
  until (element = reader.Next).nil?
    case element.GetType
    when Element::E_text
      bbox = element.GetBBox
      # NOTE(review): presumably truthy only when bbox and pos overlap, with
      # the overlap written back into bbox -- confirm against the SDK docs.
      if bbox.IntersectRect(bbox, pos)
        collected += element.GetTextString
        collected += "\n"
      end
    when Element::E_text_new_line
      # Intentionally ignored: every match already ends with a newline.
    when Element::E_form
      reader.FormBegin
      collected += RectTextSearch(reader, pos)
      reader.End
    end
  end
  return collected
end
81
PDFNet.Initialize(PDFTronLicense.Key)

# Relative path to the folder containing test files.
input_path = "../../TestFiles/newsletter.pdf"

# Switches selecting which of the examples below are run.
example1_basic = false
example2_xml = false
example3_wordlist = false
example4_advanced = true
example5_low_level = false

# Sample code showing how to use high-level text extraction APIs.
doc = PDFDoc.new(input_path)
doc.InitSecurityHandler

page = doc.GetPage(1)
if page.nil?
  # Without a page there is nothing to extract; skip the high-level examples
  # instead of handing nil to the text extractor.
  print("page no found")
else
  txt = TextExtractor.new
  txt.Begin(page) # Read the page

  # Example 1. Get all text on the page in a single string.
  # Words will be separated with space or new line characters.
  if example1_basic
    puts "Word count: " + txt.GetWordCount.to_s
    puts "- GetAsText --------------------------" + txt.GetAsText
    puts "-----------------------------------------------------------"
  end

  # Example 2. Get XML logical structure for the page.
  if example2_xml
    text = txt.GetAsXML(TextExtractor::E_words_as_elements |
                        TextExtractor::E_output_bbox |
                        TextExtractor::E_output_style_info)
    puts "- GetAsXML --------------------------" + text
    puts "-----------------------------------------------------------"
  end

  # Example 3. Extract words one by one.
  if example3_wordlist
    line = txt.GetFirstLine
    while line.IsValid do
      word = line.GetFirstWord
      while word.IsValid do
        puts word.GetString
        word = word.GetNextWord
      end
      line = line.GetNextLine
    end
    puts "-----------------------------------------------------------"
  end

  # Example 4. A more advanced text extraction example.
  # The output is XML structure containing paragraphs, lines, words,
  # as well as style and positioning information.
  if example4_advanced
    # -1 means "no <Flow>/<Para> element is currently open".
    cur_flow_id = -1
    cur_para_id = -1

    puts "<PDFText>"
    # For each line on the page...
    line = txt.GetFirstLine
    while line.IsValid do
      # Lines without words produce no output at all.
      if line.GetNumWords == 0
        line = line.GetNextLine
        next
      end

      # A new flow closes the open paragraph and flow before opening its own.
      if cur_flow_id != line.GetFlowID
        if cur_flow_id != -1
          if cur_para_id != -1
            cur_para_id = -1
            puts "</Para>"
          end
          puts "</Flow>"
        end
        cur_flow_id = line.GetFlowID
        puts "<Flow id=\"" + cur_flow_id.to_s + "\">"
      end

      if cur_para_id != line.GetParagraphID
        if cur_para_id != -1
          puts "</Para>"
        end
        cur_para_id = line.GetParagraphID
        puts "<Para id=\"" + cur_para_id.to_s + "\">"
      end

      bbox = line.GetBBox
      line_style = line.GetStyle
      print "<Line box=\"%.2f, %.2f, %.2f, %.2f\""% [bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2()]
      PrintStyle(line_style)
      print " cur_num=\"" + "%d" % line.GetCurrentNum + "\"" + ">\n"

      # For each word in the line...
      word = line.GetFirstWord
      while word.IsValid do
        # Skip empty words BEFORE printing anything, so that no unterminated
        # <Word ...> fragment is left in the output.
        if word.GetStringLen == 0
          word = word.GetNextWord
          next
        end
        # Output the bounding box for the word
        bbox = word.GetBBox
        print "<Word box=\"%.2f, %.2f, %.2f, %.2f\""% [bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2()]
        print " cur_num=\"" + "%d" % word.GetCurrentNum + "\""
        # If the word style is different from the parent style, output the new style.
        s = word.GetStyle
        if s != line_style
          PrintStyle(s)
        end
        print ">" + word.GetString + "</Word>\n"
        word = word.GetNextWord
      end
      puts "</Line>"
      line = line.GetNextLine
    end

    # Close whatever is still open after the last line.
    if cur_flow_id != -1
      if cur_para_id != -1
        cur_para_id = -1
        puts "</Para>"
      end
      puts "</Flow>"
    end
    puts "</PDFText>"
  end

  # Release the extractor no matter which examples were enabled.
  txt.Destroy
end

# Always close the first document; example 5 opens its own.
doc.Close

# Sample code showing how to use low-level text extraction APIs.
if example5_low_level
  doc = PDFDoc.new(input_path)
  doc.InitSecurityHandler

  # Example 1. Extract all text content from the document

  reader = ElementReader.new
  itr = doc.GetPageIterator
  while itr.HasNext do
    reader.Begin(itr.Current)
    DumpAllText(reader)
    reader.End
    itr.Next
  end

  # Example 2. Extract text content based on the
  # selection rectangle.

  puts "----------------------------------------------------"
  puts "Extract text based on the selection rectangle."
  puts "----------------------------------------------------"

  # A fresh iterator is requested so that Current refers to the first page.
  itr = doc.GetPageIterator
  first_page = itr.Current
  s1 = ReadTextFromRect(first_page, Rect.new(27, 392, 563, 534), reader)
  puts "Field 1: " + s1

  s1 = ReadTextFromRect(first_page, Rect.new(28, 551, 106, 623), reader)
  puts "Field 2: " + s1

  s1 = ReadTextFromRect(first_page, Rect.new(208, 550, 387, 621), reader)
  puts "Field 3: " + s1

  doc.Close
  puts "Done."
end
PDFNet.Terminate

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales