Extract Text, Read, Parse PDF - TextExtract - Python Sample Code

Sample code for using Apryse SDK to read a PDF (parse and extract text), provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. If you'd like to search text on PDF pages, see our code sample for text search.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with the Server SDK.

#---------------------------------------------------------------------------------------
# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
# Consult LICENSE.txt regarding license information.
#---------------------------------------------------------------------------------------

# The PDFNet library directory must be registered before PDFNetPython is
# imported, so the order of the statements below is significant.
import site
site.addsitedir("../../../PDFNetC/Lib")
import sys
from PDFNetPython import *

# Make the LicenseKey module importable; main() passes LicenseKey to
# PDFNet.Initialize().
sys.path.append("../../LicenseKey/PYTHON")
from LicenseKey import *


def printStyle(style):
    """Write a ``style="..."`` attribute describing *style* to stdout.

    The attribute starts with a single space and has no trailing newline, so
    the caller can keep appending to the opening tag it is building.

    Args:
        style: Object providing ``GetFontName()``, ``GetFontSize()``,
            ``GetColor()`` (indexable; the first three items are formatted
            as two-digit hex) and ``IsSerif()``. In this file it is the
            value returned by ``GetStyle()`` on a line or a word.
    """
    # NOTE(review): " sans-serif;" is emitted when IsSerif() is true, which
    # reads as inverted. Kept unchanged to preserve the sample's output --
    # confirm the intended behaviour before altering it.
    serif_token = " sans-serif;" if style.IsSerif() else ""
    rgb = style.GetColor()
    # The trailing ';' is part of the colour token written inside the quotes.
    color_hex = "%02X%02X%02X;" % (rgb[0], rgb[1], rgb[2])
    # %g drops insignificant trailing zeros (12.0 -> "12").
    font_size = "%g" % style.GetFontSize()
    sys.stdout.write(
        " style=\"font-family:" + style.GetFontName() + "; font-size:"
        + font_size + ";" + serif_token + " color:#" + color_hex + "\""
    )

def dumpAllText(reader):
    """Print the text-related elements produced by *reader*.

    Elements are pulled with ``reader.Next()`` until it returns ``None``.
    Text-block begin/end markers and new-line elements are reported with a
    fixed label; text elements are reported as their bounding box followed
    by their text string. Form elements are entered with
    ``reader.FormBegin()``, dumped recursively, and left with
    ``reader.End()``.

    Args:
        reader: An element reader on which ``Begin()`` has already been
            called by the caller (the caller also issues the matching
            ``End()``).
    """
    element = reader.Next()
    while element is not None:
        element_type = element.GetType()
        if element_type == Element.e_text_begin:
            print("Text Block Begin")
        elif element_type == Element.e_text_end:
            print("Text Block End")
        elif element_type == Element.e_text:
            bbox = element.GetBBox()
            print("BBox: " + str(bbox.GetX1()) + ", " + str(bbox.GetY1()) + ", "
                  + str(bbox.GetX2()) + ", " + str(bbox.GetY2()))
            print(element.GetTextString())
        elif element_type == Element.e_text_new_line:
            print("New Line")
        elif element_type == Element.e_form:
            # Process the form's own content with the same reader, then
            # return to the enclosing content.
            reader.FormBegin()
            dumpAllText(reader)
            reader.End()
        # Advance on every iteration, whichever branch ran.
        element = reader.Next()

# A utility method used to extract all text content from
# a given selection rectangle. The rectangle coordinates are
# expressed in PDF user/page coordinate system.
def ReadTextFromRect(page, pos, reader):
    """Return the text on *page* whose bounding boxes intersect *pos*.

    Args:
        page: The page to read; passed to ``reader.Begin()``.
        pos: The selection rectangle, forwarded to ``RectTextSearch``.
        reader: The element reader to use. It is begun on *page* here and
            ended again before returning, so it can be reused afterwards.

    Returns:
        The string produced by ``RectTextSearch(reader, pos)``.
    """
    reader.Begin(page)
    found_text = RectTextSearch(reader, pos)
    reader.End()
    return found_text

# A helper method for ReadTextFromRect
def RectTextSearch(reader, pos):
    """Collect the text of every element whose bounding box intersects *pos*.

    Elements are pulled with ``reader.Next()`` until it returns ``None``.
    Each matching text element contributes its text string followed by a
    newline. Form elements are entered with ``reader.FormBegin()``, searched
    recursively, and left with ``reader.End()``. All other element types are
    ignored.

    This function only builds and returns the string; it does not print.
    (An earlier revision printed the accumulated text after each form, which
    duplicated what the caller prints.)

    Args:
        reader: An element reader on which ``Begin()`` (or ``FormBegin()``
            for nested calls) has already been called.
        pos: The selection rectangle to test against.

    Returns:
        The concatenated text, or an empty string when nothing matches.
    """
    pieces = []
    element = reader.Next()
    while element is not None:
        element_type = element.GetType()
        if element_type == Element.e_text:
            bbox = element.GetBBox()
            # NOTE(review): IntersectRect is called on bbox with bbox as its
            # first argument; presumably it stores the intersection in bbox
            # and returns whether the rectangles overlap -- confirm against
            # the Rect API.
            if bbox.IntersectRect(bbox, pos):
                pieces.append(element.GetTextString())
                pieces.append("\n")
        elif element_type == Element.e_form:
            reader.FormBegin()
            pieces.append(RectTextSearch(reader, pos))
            reader.End()
        # Advance on every iteration, whichever branch ran.
        element = reader.Next()
    return "".join(pieces)


def _print_page_structure(txt):
    """Example 4: print the page's flows, paragraphs, lines and words as XML.

    Args:
        txt: A text extractor on which ``Begin(page)`` has already been
            called. It is only read here; the caller destroys it.
    """
    cur_flow_id = -1
    cur_para_id = -1

    print("<PDFText>")
    # For each line on the page...
    line = txt.GetFirstLine()
    while line.IsValid():
        if line.GetNumWords() == 0:
            line = line.GetNextLine()
            continue

        # A new flow closes the open paragraph (if any) and the open flow.
        if cur_flow_id != line.GetFlowID():
            if cur_flow_id != -1:
                if cur_para_id != -1:
                    cur_para_id = -1
                    print("</Para>")
                print("</Flow>")
            cur_flow_id = line.GetFlowID()
            print("<Flow id=\"" + str(cur_flow_id) + "\">")

        if cur_para_id != line.GetParagraphID():
            if cur_para_id != -1:
                print("</Para>")
            cur_para_id = line.GetParagraphID()
            print("<Para id=\"" + str(cur_para_id) + "\">")

        bbox = line.GetBBox()
        line_style = line.GetStyle()
        sys.stdout.write("<Line box=\"%.2f, %.2f, %.2f, %.2f\""
                         % (bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2()))
        printStyle(line_style)
        sys.stdout.write(" cur_num=\"" + str(line.GetCurrentNum()) + "\"" + ">\n")

        # For each word in the line...
        word = line.GetFirstWord()
        while word.IsValid():
            # Skip empty words before writing anything, so that no
            # unterminated <Word ...> tag is left in the output.
            if word.GetStringLen() == 0:
                word = word.GetNextWord()
                continue

            # Output the bounding box for the word.
            bbox = word.GetBBox()
            sys.stdout.write("<Word box=\"%.2f, %.2f, %.2f, %.2f\""
                             % (bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2()))
            sys.stdout.write(" cur_num=\"" + str(word.GetCurrentNum()) + "\"")

            # If the word style is different from the parent style, output the new style.
            word_style = word.GetStyle()
            if word_style != line_style:
                printStyle(word_style)

            sys.stdout.write(">" + word.GetString() + "</Word>\n")
            word = word.GetNextWord()

        sys.stdout.write("</Line>\n")
        line = line.GetNextLine()

    # Close whatever is still open after the last line.
    if cur_flow_id != -1:
        if cur_para_id != -1:
            sys.stdout.write("</Para>\n")
        sys.stdout.write("</Flow>\n")
    print("</PDFText>")


def _run_low_level_examples(input_path):
    """Example 5: extract text with the low-level element reader API.

    Opens its own document for *input_path*, dumps the text elements of
    every page, then prints the text found in three fixed rectangles on the
    first page, and closes the document.
    """
    doc = PDFDoc(input_path)
    doc.InitSecurityHandler()

    # Example 1. Extract all text content from the document

    reader = ElementReader()
    itr = doc.GetPageIterator()
    while itr.HasNext():
        reader.Begin(itr.Current())
        dumpAllText(reader)
        reader.End()
        itr.Next()

    # Example 2. Extract text content based on the
    # selection rectangle.

    print("----------------------------------------------------")
    print("Extract text based on the selection rectangle.")
    print("----------------------------------------------------")

    first_page = doc.GetPageIterator().Current()
    fields = (
        ("Field 1: ", Rect(27, 392, 563, 534)),
        ("Field 2: ", Rect(28, 551, 106, 623)),
        ("Field 3: ", Rect(208, 550, 387, 621)),
    )
    for label, rect in fields:
        print(label + ReadTextFromRect(first_page, rect, reader))

    doc.Close()


def main():
    """Run the enabled text-extraction examples on the sample newsletter PDF.

    The ``example*`` flags below select which examples run. Examples 1-4 use
    the high-level text extractor on page 1; example 5 uses the low-level
    element reader on its own document instance.

    NOTE(review): this listing had lost its indentation, so nesting was
    reconstructed from the control flow. The extractor and the document are
    released after the high-level examples regardless of which flags are
    set, and the library is terminated even if an example raises.
    """
    PDFNet.Initialize(LicenseKey)
    try:
        # Relative path to the folder containing test files.
        input_path = "../../TestFiles/newsletter.pdf"
        example1_basic = False
        example2_xml = False
        example3_wordlist = False
        example4_advanced = True
        example5_low_level = False

        # Sample code showing how to use high-level text extraction APIs.
        doc = PDFDoc(input_path)
        doc.InitSecurityHandler()

        page = doc.GetPage(1)
        if page is None:
            # Nothing can be extracted without a page; stop instead of
            # handing None to the extractor.
            print("page not found")
            doc.Close()
            return

        txt = TextExtractor()
        txt.Begin(page)  # Read the page

        # Example 1. Get all text on the page in a single string.
        # Words will be separated with space or new line characters.
        if example1_basic:
            print("Word count: " + str(txt.GetWordCount()))
            txtAsText = txt.GetAsText()
            print("- GetAsText --------------------------" + txtAsText)
            print("-----------------------------------------------------------")

        # Example 2. Get XML logical structure for the page.
        if example2_xml:
            text = txt.GetAsXML(TextExtractor.e_words_as_elements |
                                TextExtractor.e_output_bbox |
                                TextExtractor.e_output_style_info)
            print("- GetAsXML --------------------------" + text)
            print("-----------------------------------------------------------")

        # Example 3. Extract words one by one.
        if example3_wordlist:
            line = txt.GetFirstLine()
            while line.IsValid():
                word = line.GetFirstWord()
                while word.IsValid():
                    print(word.GetString())
                    word = word.GetNextWord()
                line = line.GetNextLine()
            print("-----------------------------------------------------------")

        # Example 4. A more advanced text extraction example.
        # The output is XML structure containing paragraphs, lines, words,
        # as well as style and positioning information.
        if example4_advanced:
            _print_page_structure(txt)

        txt.Destroy()
        doc.Close()

        # Sample code showing how to use low-level text extraction APIs.
        if example5_low_level:
            _run_low_level_examples(input_path)

        print("Done.")
    finally:
        PDFNet.Terminate()

# Run the sample only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales