Extract Text, Read, Parse PDF - TextExtract

Sample code for using Apryse SDK to read a PDF (parse and extract text), provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. If you'd like to search for text on PDF pages, see our code sample for text search. Learn more about our Server SDK and PDF Data Extraction SDK Capabilities.

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
14
def printStyle(style):
    """Write an inline ``style="..."`` attribute for *style* to stdout.

    The attribute carries the font name, the font size (formatted with
    ``%g``), an optional generic-family marker, and the color as six
    upper-case hex digits. Nothing is returned and no newline is written,
    so the caller can keep appending to the same tag.

    ``style`` must provide ``IsSerif()``, ``GetColor()`` (indexable, three
    components), ``GetFontSize()`` and ``GetFontName()``.
    """
    # A serif font is labelled "serif". Non-serif fonts get no marker because
    # "not serif" does not imply any particular generic family.
    serif_str = " serif;" if style.IsSerif() else ""
    rgb = style.GetColor()
    rgb_hex = "%02X%02X%02X;" % (rgb[0], rgb[1], rgb[2])
    font_str = "%g" % style.GetFontSize()
    sys.stdout.write(
        " style=\"font-family:" + style.GetFontName() + "; font-size:"
        + font_str + ";" + serif_str + " color:#" + rgb_hex + "\""
    )
24
def dumpAllText(reader):
    """Print every text-related element that *reader* yields.

    Elements are pulled with ``reader.Next()`` until it returns ``None``:

    * text-begin / text-end / new-line elements print a marker line;
    * text elements print their bounding box followed by their text;
    * form elements are entered with ``reader.FormBegin()``, dumped
      recursively, and left again with ``reader.End()``.

    The caller is expected to have started the reader on a page (or form)
    beforehand; this function does not call ``reader.Begin``.
    """
    element = reader.Next()
    while element is not None:
        elem_type = element.GetType()
        if elem_type == Element.e_text_begin:
            print("Text Block Begin")
        elif elem_type == Element.e_text_end:
            print("Text Block End")
        elif elem_type == Element.e_text:
            bbox = element.GetBBox()
            print("BBox: " + str(bbox.GetX1()) + ", " + str(bbox.GetY1()) + ", "
                  + str(bbox.GetX2()) + ", " + str(bbox.GetY2()))
            print(element.GetTextString())
        elif elem_type == Element.e_text_new_line:
            print("New Line")
        elif elem_type == Element.e_form:
            # Descend into the form's own content stream, then pop back out.
            reader.FormBegin()
            dumpAllText(reader)
            reader.End()
        element = reader.Next()
46
def ReadTextFromRect(page, pos, reader):
    """Return all text content found inside a selection rectangle on *page*.

    The rectangle coordinates in *pos* are expressed in the PDF user/page
    coordinate system. *reader* is started on *page*, searched with
    ``RectTextSearch``, and always ended again, even if the search raises.
    """
    reader.Begin(page)
    try:
        return RectTextSearch(reader, pos)
    finally:
        # Keep Begin/End balanced so the shared reader stays reusable.
        reader.End()
55
def RectTextSearch(reader, pos):
    """Collect the text of every element whose box overlaps *pos*.

    Helper for ``ReadTextFromRect``. Elements are pulled with
    ``reader.Next()`` until it returns ``None``. Each matching text element
    contributes its text followed by a newline; form elements are searched
    recursively. The collected text is returned as a single string and
    nothing is written to stdout.
    """
    pieces = []
    element = reader.Next()
    while element is not None:
        elem_type = element.GetType()
        if elem_type == Element.e_text:
            bbox = element.GetBBox()
            # The return value of IntersectRect is used as the overlap test.
            # NOTE(review): it presumably also overwrites bbox with the
            # intersection; bbox is not used afterwards, so that is harmless.
            if bbox.IntersectRect(bbox, pos):
                pieces.append(element.GetTextString())
                pieces.append("\n")
        elif elem_type == Element.e_form:
            # Descend into the form's own content stream, then pop back out.
            reader.FormBegin()
            pieces.append(RectTextSearch(reader, pos))
            reader.End()
        element = reader.Next()
    return "".join(pieces)
77
78
_SEPARATOR = "-----------------------------------------------------------"


def _print_word_list(txt):
    """Example 3: print every word of the extracted page, one per line."""
    line = txt.GetFirstLine()
    while line.IsValid():
        word = line.GetFirstWord()
        while word.IsValid():
            print(word.GetString())
            word = word.GetNextWord()
        line = line.GetNextLine()
    print(_SEPARATOR)


def _print_structured_text(txt):
    """Example 4: print the extracted page as nested XML-like markup.

    The output is a ``<PDFText>`` element containing ``<Flow>``, ``<Para>``,
    ``<Line>`` and ``<Word>`` elements, with bounding boxes and style
    information. Lines without words and empty words are skipped entirely.
    """
    write = sys.stdout.write
    box_fmt = " box=\"%.2f, %.2f, %.2f, %.2f\""
    cur_flow_id = -1
    cur_para_id = -1

    print("<PDFText>")
    # For each line on the page...
    line = txt.GetFirstLine()
    while line.IsValid():
        if line.GetNumWords() == 0:
            line = line.GetNextLine()
            continue

        # Close the previous flow (and its open paragraph) when the flow changes.
        flow_id = line.GetFlowID()
        if cur_flow_id != flow_id:
            if cur_flow_id != -1:
                if cur_para_id != -1:
                    cur_para_id = -1
                    print("</Para>")
                print("</Flow>")
            cur_flow_id = flow_id
            print("<Flow id=\"" + str(cur_flow_id) + "\">")

        para_id = line.GetParagraphID()
        if cur_para_id != para_id:
            if cur_para_id != -1:
                print("</Para>")
            cur_para_id = para_id
            print("<Para id=\"" + str(cur_para_id) + "\">")

        bbox = line.GetBBox()
        line_style = line.GetStyle()
        write("<Line" + box_fmt % (bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2()))
        printStyle(line_style)
        write(" cur_num=\"" + str(line.GetCurrentNum()) + "\"" + ">\n")

        # For each word in the line...
        word = line.GetFirstWord()
        while word.IsValid():
            # Skip empty words before emitting anything, so no opening
            # <Word ...> tag is ever left unclosed.
            if word.GetStringLen() == 0:
                word = word.GetNextWord()
                continue
            # Output the bounding box for the word.
            bbox = word.GetBBox()
            write("<Word" + box_fmt % (bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2()))
            write(" cur_num=\"" + str(word.GetCurrentNum()) + "\"")
            # If the word style differs from the line style, output the new style.
            word_style = word.GetStyle()
            if word_style != line_style:
                printStyle(word_style)
            write(">" + word.GetString() + "</Word>\n")
            word = word.GetNextWord()
        write("</Line>\n")
        line = line.GetNextLine()

    if cur_flow_id != -1:
        if cur_para_id != -1:
            write("</Para>\n")
        write("</Flow>\n")
    print("</PDFText>")


def _run_low_level_examples(input_path):
    """Example 5: extract text with the low-level ElementReader API."""
    doc = PDFDoc(input_path)
    try:
        doc.InitSecurityHandler()

        # Example 1. Extract all text content from the document.
        reader = ElementReader()
        itr = doc.GetPageIterator()
        while itr.HasNext():
            reader.Begin(itr.Current())
            dumpAllText(reader)
            reader.End()
            itr.Next()

        # Example 2. Extract text content based on the selection rectangle.
        print("----------------------------------------------------")
        print("Extract text based on the selection rectangle.")
        print("----------------------------------------------------")

        itr = doc.GetPageIterator()
        first_page = itr.Current()
        selections = (
            ("Field 1: ", Rect(27, 392, 563, 534)),
            ("Field 2: ", Rect(28, 551, 106, 623)),
            ("Field 3: ", Rect(208, 550, 387, 621)),
        )
        for label, rect in selections:
            print(label + ReadTextFromRect(first_page, rect, reader))
    finally:
        doc.Close()


def main():
    """Run the text-extraction samples selected by the ``example*`` flags.

    The high-level examples (1-4) use a ``TextExtractor`` on page 1 of the
    input document; example 5 reopens the document and uses the low-level
    ``ElementReader`` API. The extractor and document are always released,
    and ``PDFNet.Terminate()`` always runs, regardless of which flags are
    set or whether an example raises.
    """
    PDFNet.Initialize(LicenseKey)
    try:
        # Relative path to the test file.
        input_path = "../../TestFiles/newsletter.pdf"
        example1_basic = False
        example2_xml = False
        example3_wordlist = False
        example4_advanced = True
        example5_low_level = False

        # Sample code showing how to use high-level text extraction APIs.
        doc = PDFDoc(input_path)
        try:
            doc.InitSecurityHandler()

            page = doc.GetPage(1)
            # NOTE(review): whether GetPage reports a missing page as None or
            # as an invalid Page object is not visible here; confirm.
            if page is None:
                print("page not found")
                # Nothing can be extracted without a page; the finally
                # blocks still close the document and terminate PDFNet.
                return

            txt = TextExtractor()
            try:
                txt.Begin(page)  # Read the page

                # Example 1. Get all text on the page in a single string.
                # Words will be separated with space or new line characters.
                if example1_basic:
                    print("Word count: " + str(txt.GetWordCount()))
                    print("- GetAsText --------------------------" + txt.GetAsText())
                    print(_SEPARATOR)

                # Example 2. Get XML logical structure for the page.
                if example2_xml:
                    xml_flags = (TextExtractor.e_words_as_elements |
                                 TextExtractor.e_output_bbox |
                                 TextExtractor.e_output_style_info)
                    print("- GetAsXML --------------------------" + txt.GetAsXML(xml_flags))
                    print(_SEPARATOR)

                # Example 3. Extract words one by one.
                if example3_wordlist:
                    _print_word_list(txt)

                # Example 4. A more advanced text extraction example.
                # The output is XML structure containing paragraphs, lines,
                # words, as well as style and positioning information.
                if example4_advanced:
                    _print_structured_text(txt)
            finally:
                txt.Destroy()
        finally:
            doc.Close()

        # Sample code showing how to use low-level text extraction APIs.
        if example5_low_level:
            _run_low_level_examples(input_path)

        print("Done.")
    finally:
        PDFNet.Terminate()
235
# Run the samples only when this file is executed as a script.
if __name__ == "__main__":
    main()

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales