TextExtract

Sample code for using the Apryse SDK to read a PDF (parse and extract text), provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go, and VB. If you'd like to search text on PDF pages, see our code sample for text search. Learn more about our Server SDK and PDF Data Extraction SDK capabilities.

//
// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
//
4
using System;
using System.Drawing;
using System.Globalization;
using System.Security;
using System.Text;
using pdftron;
using pdftron.Common;
using pdftron.Filters;
using pdftron.PDF;
using pdftron.SDF;
12
13
14namespace TextExtractTestCS
15{
    // This sample illustrates various text extraction capabilities of PDFNet.
17
// Console sample that exercises PDFNet's high-level (TextExtractor) and
// low-level (ElementReader) text extraction APIs on "newsletter.pdf".
class Class1
{
    // Obtains the PDFNetLoader singleton when this type is initialized.
    // NOTE(review): presumably this makes the native PDFNet library resolvable
    // before the first PDFNet call -- confirm against the PDFNetLoader docs.
    private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
    static Class1() {}

    // Entry point. Initializes PDFNet, runs the enabled examples and then
    // terminates PDFNet. Terminate() sits in a finally block so it also runs
    // after the early "Page not found." return and after unexpected exceptions.
    static void Main(string[] args)
    {
        PDFNet.Initialize(PDFTronLicense.Key);
        try
        {
            RunExamples();
        }
        finally
        {
            PDFNet.Terminate();
        }
    }

    // Runs every example whose flag below is set to true.
    // PDFNetException is reported on the console; other exceptions propagate.
    static void RunExamples()
    {
        // Relative path to the folder containing test files.
        string input_path = "../../../../TestFiles/";

        // Toggle the individual examples here.
        bool example1_basic = false;
        bool example2_xml = false;
        bool example3_wordlist = false;
        bool example4_advanced = true;
        bool example5_low_level = false;

        // Sample code showing how to use high-level text extraction APIs.
        try
        {
            using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
            {
                doc.InitSecurityHandler();

                Page page = doc.GetPage(1);
                if (page == null)
                {
                    Console.WriteLine("Page not found.");
                    return;
                }

                using (TextExtractor txt = new TextExtractor())
                {
                    txt.Begin(page); // Read the page.
                    // Other options you may want to consider...
                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                    // ...

                    // Example 1. Get all text on the page in a single string.
                    // Words will be separated with space or new line characters.
                    if (example1_basic)
                    {
                        // Get the word count.
                        Console.WriteLine("Word Count: {0}", txt.GetWordCount());

                        Console.WriteLine("\n\n- GetAsText --------------------------\n{0}", txt.GetAsText());
                        Console.WriteLine("-----------------------------------------------------------");
                    }

                    // Example 2. Get XML logical structure for the page.
                    if (example2_xml)
                    {
                        String text = txt.GetAsXML(
                            TextExtractor.XMLOutputFlags.e_words_as_elements |
                            TextExtractor.XMLOutputFlags.e_output_bbox |
                            TextExtractor.XMLOutputFlags.e_output_style_info);
                        Console.WriteLine("\n\n- GetAsXML --------------------------\n{0}", text);
                        Console.WriteLine("-----------------------------------------------------------");
                    }

                    // Example 3. Extract words one by one.
                    if (example3_wordlist)
                    {
                        for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                        {
                            for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                            {
                                Console.WriteLine(word.GetString());
                            }
                        }
                        Console.WriteLine("-----------------------------------------------------------");
                    }

                    // Example 4. A more advanced text extraction example.
                    // The output is XML structure containing paragraphs, lines, words,
                    // as well as style and positioning information.
                    if (example4_advanced)
                    {
                        PrintAdvancedXml(txt);
                    }
                }
            }
        }
        catch (PDFNetException e)
        {
            Console.WriteLine(e.Message);
        }

        // Sample code showing how to use low-level text extraction APIs.
        if (example5_low_level)
        {
            RunLowLevelExample(input_path);
        }
    }

    // Writes the page held by 'txt' as a <PDFText> document made of nested
    // <Flow>, <Para>, <Line> and <Word> elements. The opening and closing
    // <PDFText> tags are both written here so the output is always balanced.
    static void PrintAdvancedXml(TextExtractor txt)
    {
        // -1 means "no flow / paragraph element is currently open".
        int cur_flow_id = -1, cur_para_id = -1;

        Console.WriteLine("<PDFText>");

        // For each line on the page...
        for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
        {
            if (line.GetNumWords() == 0)
            {
                continue;
            }

            // A new flow closes the open paragraph and flow before opening the next one.
            int flow_id = line.GetFlowID();
            if (cur_flow_id != flow_id)
            {
                if (cur_flow_id != -1)
                {
                    if (cur_para_id != -1)
                    {
                        cur_para_id = -1;
                        Console.WriteLine("</Para>");
                    }
                    Console.WriteLine("</Flow>");
                }
                cur_flow_id = flow_id;
                Console.WriteLine("<Flow id=\"{0}\">", cur_flow_id);
            }

            int para_id = line.GetParagraphID();
            if (cur_para_id != para_id)
            {
                if (cur_para_id != -1)
                {
                    Console.WriteLine("</Para>");
                }
                cur_para_id = para_id;
                Console.WriteLine("<Para id=\"{0}\">", cur_para_id);
            }

            TextExtractor.Style line_style = line.GetStyle();
            Console.Write("<Line box=\"{0}\"", FormatBox(line.GetBBox()));
            PrintStyle(line_style);
            Console.Write(" cur_num=\"" + line.GetCurrentNum() + "\"" + ">\n");

            // For each word in the line...
            for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
            {
                // Skip empty words BEFORE writing anything, otherwise an
                // unterminated "<Word ..." fragment would be left in the output.
                if (word.GetStringLen() == 0)
                {
                    continue;
                }

                // Output the bounding box for the word.
                Console.Write("<Word box=\"{0}\"", FormatBox(word.GetBBox()));
                Console.Write(" cur_num=\"" + word.GetCurrentNum() + "\"");

                // If the word style is different from the parent style, output the new style.
                TextExtractor.Style s = word.GetStyle();
                if (s != line_style)
                {
                    PrintStyle(s);
                }

                // Escape the text so characters such as '<' or '&' keep the XML well-formed.
                Console.Write(">{0}", EscapeXml(word.GetString()));
                Console.WriteLine("</Word>");
            }
            Console.WriteLine("</Line>");
        }

        // Close whatever is still open after the last line.
        if (cur_flow_id != -1)
        {
            if (cur_para_id != -1)
            {
                cur_para_id = -1;
                Console.WriteLine("</Para>");
            }
            Console.WriteLine("</Flow>");
        }

        Console.WriteLine("</PDFText>");
    }

    // Runs the ElementReader based examples: dumps the text of the first page
    // and then reads the text inside three fixed rectangles on page 1.
    static void RunLowLevelExample(string input_path)
    {
        try
        {
            LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
            using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
            {
                doc.InitSecurityHandler();

                using (ElementReader reader = new ElementReader())
                {
                    // Example 1. Extract all text content from the document.
                    PageIterator itr = doc.GetPageIterator();
                    // Only the first page is read; replace the 'if' with
                    // 'for (; itr.HasNext(); itr.Next())' to read every page.
                    if (itr.HasNext())
                    {
                        reader.Begin(itr.Current());
                        LowLevelTextExtractUtils.DumpAllText(reader);
                        reader.End();
                    }

                    // Example 2. Extract text based on the selection rectangle.
                    Console.WriteLine("----------------------------------------------------");
                    Console.WriteLine("Extract text based on the selection rectangle.");
                    Console.WriteLine("----------------------------------------------------");

                    Page first_page = doc.GetPage(1);
                    string field1 = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                    string field2 = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                    string field3 = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                    Console.WriteLine("Field 1: {0}", field1);
                    Console.WriteLine("Field 2: {0}", field2);
                    Console.WriteLine("Field 3: {0}", field3);
                    // ...

                    Console.WriteLine("Done.");
                }
            }
        }
        catch (PDFNetException e)
        {
            Console.WriteLine(e.Message);
        }
    }

    // Formats a rectangle as "x1, y1, x2, y2" with two decimals each.
    // The invariant culture is used so the decimal separator is always '.',
    // which keeps it distinguishable from the ", " between the coordinates.
    static string FormatBox(Rect box)
    {
        return string.Format(CultureInfo.InvariantCulture,
            "{0:0.00}, {1:0.00}, {2:0.00}, {3:0.00}", box.x1, box.y1, box.x2, box.y2);
    }

    // Escapes the characters that are not allowed verbatim in XML text or
    // attribute values (<, >, &, " and ').
    static string EscapeXml(string text)
    {
        return SecurityElement.Escape(text);
    }

    // Writes a style="..." attribute (font family, font size, optional serif
    // marker and RGB color) for the given text style to the console.
    static void PrintStyle(TextExtractor.Style s)
    {
        Color rgb = s.GetColor();
        // The trailing ';' terminates the color declaration inside the attribute.
        string rgb_hex = string.Format("{0:X2}{1:X2}{2:X2};", rgb.R, rgb.G, rgb.B);
        string font_size = string.Format(CultureInfo.InvariantCulture, "{0}", s.GetFontSize());
        // Serif fonts are labelled "serif" (the label used to read "sans-serif").
        Console.Write(" style=\"font-family:{0}; font-size:{1};{2} color:#{3}\"",
            EscapeXml(s.GetFontName()), font_size, (s.IsSerif() ? " serif;" : ""), rgb_hex);
    }
}
230
// Helpers that extract text with the low-level ElementReader API by walking
// the element sequence of a page, descending into form XObjects.
class LowLevelTextExtractUtils
{
    // Writes all text content delivered by 'reader' to the console.
    // The caller must have called reader.Begin(...) beforehand and is
    // responsible for the matching reader.End().
    public static void DumpAllText(ElementReader reader)
    {
        Element element;
        while ((element = reader.Next()) != null)
        {
            switch (element.GetType())
            {
                case Element.Type.e_text_begin:
                    Console.WriteLine("\n--> Text Block Begin");
                    break;

                case Element.Type.e_text_end:
                    Console.WriteLine("\n--> Text Block End");
                    break;

                case Element.Type.e_text:
                    // One text run per console line. If the position is needed
                    // as well, fill a Rect via element.GetBBox(rect).
                    Console.WriteLine(element.GetTextString());
                    break;

                case Element.Type.e_form: // Process form XObjects
                    reader.FormBegin();
                    DumpAllText(reader);
                    reader.End();
                    break;
            }
        }
    }

    // A helper method for ReadTextFromRect. Appends every text run whose
    // bounding box intersects 'pos' to 'found', one run per line, and
    // recurses into form XObjects.
    private static void RectTextSearch(ElementReader reader, Rect pos, StringBuilder found)
    {
        Element element;
        while ((element = reader.Next()) != null)
        {
            switch (element.GetType())
            {
                case Element.Type.e_text:
                {
                    Rect bbox = new Rect();
                    element.GetBBox(bbox);
                    // Only the boolean result of IntersectRect is used here.
                    if (bbox.IntersectRect(bbox, pos))
                    {
                        found.Append(element.GetTextString());
                        found.Append("\n");
                    }
                    break;
                }

                case Element.Type.e_form: // Process form XObjects
                    reader.FormBegin();
                    RectTextSearch(reader, pos, found);
                    reader.End();
                    break;
            }
        }
    }

    // A utility method used to extract all text content from
    // a given selection rectangle. The rectangle coordinates are
    // expressed in PDF user/page coordinate system.
    //
    // page   - the page to read.
    // pos    - the selection rectangle.
    // reader - the ElementReader used to traverse the page; this method
    //          calls Begin(page) and End() on it.
    // Returns the matching text runs, each followed by "\n" (empty string if
    // nothing intersects the rectangle).
    public string ReadTextFromRect(Page page, Rect pos, ElementReader reader)
    {
        // The result is collected in a local builder rather than an instance
        // field, so calls do not share state and appends stay cheap.
        StringBuilder found = new StringBuilder();
        reader.Begin(page);
        RectTextSearch(reader, pos, found);
        reader.End();
        return found.ToString();
    }
}
324}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales