TextExtract

Sample C# code for using the Apryse SDK to read a PDF (parse and extract text). If you would like to search for text on PDF pages, see our text search code sample. Learn more about our UWP SDK and PDF Data Extraction SDK capabilities.

1//
2// Copyright (c) 2001-2020 by PDFTron Systems Inc. All Rights Reserved.
3//
4
5using System;
6using System.IO;
7using System.Threading.Tasks;
8using Windows.Foundation;
9
10using pdftron.PDF;
11using pdftron.SDF;
12
13using PDFNetUniversalSamples.ViewModels;
14
15namespace PDFNetSamples
16{
/// <summary>
/// Sample that runs the text extraction examples against "newsletter.pdf":
/// plain text, XML, a per-line word list, an optional hand-built XML dump,
/// and an optional low-level ElementReader walk.
/// </summary>
public sealed class TextExtractTest : Sample
{
    // Rule written after each high-level example's output.
    private const string SectionRule = "-----------------------------------------------------------\n";

    public TextExtractTest() :
        base("TextExtract", "The sample illustrates the basic text extraction capabilities of PDFNet.")
    {
    }

    /// <summary>
    /// Runs the enabled examples on a background task and writes their output
    /// through the sample's Write/WriteLine helpers.
    /// </summary>
    /// <returns>An action that completes once all output has been flushed.</returns>
    public override IAsyncAction RunAsync()
    {
        return Task.Run(new System.Action(() =>
        {
            WriteLine("--------------------------------");
            WriteLine("Starting TextExtract Test...");
            WriteLine("--------------------------------\n");

            // Toggle individual examples here.
            bool example1_basic = true;
            bool example2_xml = true;
            bool example3_wordlist = true;
            bool example4_advanced = false;
            bool example5_low_level = false;

            // Sample code showing how to use high-level text extraction APIs.
            try
            {
                RunHighLevelExamples(example1_basic, example2_xml, example3_wordlist, example4_advanced);
            }
            catch (Exception e)
            {
                WriteLine(GetExceptionMessage(e));
            }

            // Sample code showing how to use low-level text extraction APIs.
            if (example5_low_level)
            {
                try
                {
                    RunLowLevelExamples();
                }
                catch (Exception e)
                {
                    WriteLine(GetExceptionMessage(e));
                }
            }

            // The examples live in helpers so that an early exit from one of them
            // (e.g. "Page not found.") can no longer skip this footer and Flush().
            WriteLine("\n--------------------------------");
            WriteLine("Done TextExtract Test.");
            WriteLine("--------------------------------\n");
            Flush();
        })).AsAsyncAction();
    }

    /// <summary>
    /// Opens the input document, runs TextExtractor on page 1 and prints the
    /// output of the enabled examples.
    /// </summary>
    /// <param name="basic">Example 1: word count and the page text as one string.</param>
    /// <param name="xml">Example 2: the XML produced by GetAsXML.</param>
    /// <param name="wordList">Example 3: words enumerated line by line.</param>
    /// <param name="advanced">Example 4: hand-built Flow/Para/Line/Word XML.</param>
    private void RunHighLevelExamples(bool basic, bool xml, bool wordList, bool advanced)
    {
        string input_file_path = Path.Combine(InputPath, "newsletter.pdf");
        WriteLine("Opening input file " + input_file_path);

        PDFDoc doc = new PDFDoc(input_file_path);
        try
        {
            doc.InitSecurityHandler();
            pdftron.PDF.Page page = doc.GetPage(1);
            if (page == null)
            {
                WriteLine("Page not found.");
                return; // the finally block still destroys the document
            }

            TextExtractor txt = new TextExtractor();
            txt.Begin(page); // Read the page.
            // Other options you may want to consider...
            // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
            // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
            // ...

            // Accumulate in a builder; the word loops below would otherwise
            // re-copy the whole output string on every append.
            System.Text.StringBuilder output = new System.Text.StringBuilder();

            // Example 1. Get all text on the page in a single string.
            // Words will be separated with space or new line characters.
            if (basic)
            {
                output.AppendFormat("Word Count: {0}\n", txt.GetWordCount());
                output.AppendFormat("\n- GetAsText --------------------------\n{0}\n", txt.GetAsText());
                output.Append(SectionRule);
            }

            // Example 2. Get XML logical structure for the page.
            if (xml)
            {
                String text = txt.GetAsXML(TextExtractorXMLOutputFlags.e_words_as_elements | TextExtractorXMLOutputFlags.e_output_bbox | TextExtractorXMLOutputFlags.e_output_style_info);

                output.AppendFormat("\n\n- GetAsXML --------------------------\n{0}\n", text);
                output.Append(SectionRule);
            }

            // Example 3. Extract words one by one.
            if (wordList)
            {
                for (TextExtractorLine line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                {
                    for (TextExtractorWord word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                    {
                        output.Append(' ').Append(word.GetString());
                    }
                    output.Append('\n');
                }
                output.Append(SectionRule);
            }

            // Print result of Example 1, 2 and 3
            WriteLine(output.ToString());

            // Example 4. A more advanced text extraction example.
            // The output is XML structure containing paragraphs, lines, words,
            // as well as style and positioning information.
            if (advanced)
            {
                WriteAdvancedXml(txt);
            }
        }
        finally
        {
            // Release the document even when extraction throws or exits early.
            doc.Destroy();
        }

        WriteLine("Done.");
    }

    /// <summary>
    /// Writes a Flow / Para / Line / Word structure for every non-empty line the
    /// extractor reports, including bounding boxes and style attributes.
    /// </summary>
    /// <param name="txt">An extractor on which Begin has already been called.</param>
    private void WriteAdvancedXml(TextExtractor txt)
    {
        int cur_flow_id = -1, cur_para_id = -1;

        // For each line on the page...
        for (TextExtractorLine line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
        {
            if (line.GetNumWords() == 0)
            {
                continue;
            }

            // A change of flow closes any open paragraph and flow first.
            if (cur_flow_id != line.GetFlowID())
            {
                if (cur_flow_id != -1)
                {
                    if (cur_para_id != -1)
                    {
                        cur_para_id = -1;
                        WriteLine("</Para>");
                    }
                    WriteLine("</Flow>");
                }
                cur_flow_id = line.GetFlowID();
                WriteLine(string.Format("<Flow id=\"{0}\">", cur_flow_id));
            }

            if (cur_para_id != line.GetParagraphID())
            {
                if (cur_para_id != -1)
                {
                    WriteLine("</Para>");
                }
                cur_para_id = line.GetParagraphID();
                WriteLine(string.Format("<Para id=\"{0}\">", cur_para_id));
            }

            pdftron.PDF.Rect bbox = line.GetBBox();
            TextExtractorStyle line_style = line.GetStyle();

            // Emit the start tag in pieces on one output line and terminate it with
            // '>' (previously the <Line ...> tag was never closed).
            Write(string.Format("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1, bbox.y1, bbox.x2, bbox.y2));
            PrintStyle(line_style);
            WriteLine(">");

            // For each word in the line...
            for (TextExtractorWord word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
            {
                // Skip empty words before writing anything, so no dangling
                // "<Word box=..." fragment is left in the output.
                if (word.GetStringLen() == 0)
                {
                    continue;
                }

                // Output the bounding box for the word.
                bbox = word.GetBBox();
                Write(string.Format("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1, bbox.y1, bbox.x2, bbox.y2));

                // If the word style is different from the parent style, output the new style.
                // NOTE(review): this relies on how '!=' is defined for TextExtractorStyle;
                // if the type does not overload it, this is a reference comparison - confirm.
                TextExtractorStyle s = word.GetStyle();
                if (s != line_style)
                {
                    PrintStyle(s);
                }

                WriteLine(string.Format(">\n{0}", word.GetString()));
                WriteLine("</Word>");
            }
            WriteLine("</Line>");
        }

        // Close whatever is still open after the last line.
        if (cur_flow_id != -1)
        {
            if (cur_para_id != -1)
            {
                WriteLine("</Para>");
            }
            WriteLine("</Flow>");
        }
    }

    /// <summary>
    /// Opens the input document and runs the ElementReader based examples: a dump
    /// of the text elements of the first page, then text lookups in three fixed
    /// rectangles on page 1.
    /// </summary>
    private void RunLowLevelExamples()
    {
        LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
        PDFDoc doc = new PDFDoc(Path.Combine(InputPath, "newsletter.pdf"));
        try
        {
            doc.InitSecurityHandler();

            // Example 1. Extract all text content from the document
            ElementReader reader = new ElementReader();
            PageIterator itr = doc.GetPageIterator();

            // Only the page the iterator currently points at is dumped. To read every
            // page, wrap this in: for (; itr.HasNext(); itr.Next()) { ... }
            reader.Begin(itr.Current());
            WriteLine(util.DumpAllText(reader, this));
            reader.End();

            // Example 2. Extract text based on the selection rectangle.
            WriteLine("----------------------------------------------------");
            WriteLine("Extract text based on the selection rectangle.");
            WriteLine("----------------------------------------------------");

            pdftron.PDF.Page first_page = doc.GetPage(1);
            string field1 = util.ReadTextFromRect(first_page, new pdftron.PDF.Rect(27, 392, 563, 534), reader);
            string field2 = util.ReadTextFromRect(first_page, new pdftron.PDF.Rect(28, 551, 106, 623), reader);
            string field3 = util.ReadTextFromRect(first_page, new pdftron.PDF.Rect(208, 550, 387, 621), reader);

            WriteLine(string.Format("Field 1: {0}", field1));
            WriteLine(string.Format("Field 2: {0}", field2));
            WriteLine(string.Format("Field 3: {0}", field3));
            // ...
        }
        finally
        {
            // Release the document even when one of the examples throws.
            doc.Destroy();
        }

        WriteLine("Done.");
    }

    /// <summary>
    /// Writes a style attribute (font name, font size, optional serif marker)
    /// for the start tag currently being emitted.
    /// </summary>
    /// <param name="s">Style of the line or word being written.</param>
    void PrintStyle(TextExtractorStyle s)
    {
        // Write, not WriteLine: the attribute belongs to a tag the caller has not
        // closed yet. A serif font is reported as "serif" (it used to be labelled
        // "sans-serif").
        Write(string.Format(" style=\"font-family: {0}; font-size: {1}; {2}\"", s.GetFontName(), s.GetFontSize(), (s.IsSerif() ? " serif; " : " ")));
    }
}
246
/// <summary>
/// Helpers that walk a page's content with ElementReader to collect its text.
/// </summary>
sealed class LowLevelTextExtractUtils
{
    /// <summary>
    /// Walks the elements available from <paramref name="reader"/> and returns a
    /// textual dump: text block begin/end markers, new-line markers, and for each
    /// text run its bounding box followed by its text. Form XObjects are entered
    /// recursively and their content is included in the same dump.
    /// </summary>
    /// <param name="reader">A reader on which Begin has already been called.</param>
    /// <param name="sample">
    /// Not used any more; kept so existing call sites still compile. Text runs are
    /// now part of the returned string instead of being written to the sample
    /// directly, which keeps each run next to its bounding box.
    /// </param>
    /// <returns>The dump, one entry per line.</returns>
    public String DumpAllText(ElementReader reader, Sample sample)
    {
        System.Text.StringBuilder result = new System.Text.StringBuilder();
        AppendAllText(reader, result);
        return result.ToString();
    }

    // Recursive worker for DumpAllText. It appends to one shared builder so the
    // content of nested form XObjects is kept rather than thrown away.
    private static void AppendAllText(ElementReader reader, System.Text.StringBuilder result)
    {
        Element element;
        while ((element = reader.Next()) != null)
        {
            switch (element.GetType())
            {
                case ElementType.e_text_begin:
                    result.Append("--> Text Block Begin\n");
                    break;
                case ElementType.e_text_end:
                    result.Append("--> Text Block End\n");
                    break;
                case ElementType.e_text:
                    {
                        pdftron.PDF.Rect bbox = new pdftron.PDF.Rect();
                        element.GetBBox(bbox);
                        result.AppendFormat("\n--> BBox: {0}, {1}, {2}, {3}\n", bbox.x1, bbox.y1, bbox.x2, bbox.y2);
                        result.Append(element.GetTextString()).Append('\n');
                        break;
                    }
                case ElementType.e_text_new_line:
                    result.Append("--> New Line\n");
                    break;
                case ElementType.e_form: // Process form XObjects
                    {
                        reader.FormBegin();
                        AppendAllText(reader, result);
                        reader.End();
                        break;
                    }
            }
        }
    }

    // Recursive worker for ReadTextFromRect: appends every text run whose bounding
    // box intersects 'pos' to 'found', one run per line.
    private static void RectTextSearch(ElementReader reader, pdftron.PDF.Rect pos, System.Text.StringBuilder found)
    {
        Element element;
        while ((element = reader.Next()) != null)
        {
            switch (element.GetType())
            {
                case ElementType.e_text:
                    {
                        pdftron.PDF.Rect bbox = new pdftron.PDF.Rect();
                        element.GetBBox(bbox);
                        // bbox is both an argument and the receiver, exactly as before.
                        if (bbox.IntersectRect(bbox, pos))
                        {
                            found.Append(element.GetTextString());
                            found.Append('\n');
                        }
                        break;
                    }
                case ElementType.e_form: // Process form XObjects
                    {
                        reader.FormBegin();
                        RectTextSearch(reader, pos, found);
                        reader.End();
                        break;
                    }
                // All other element types (including e_text_new_line) add nothing.
            }
        }
    }

    /// <summary>
    /// Extracts the text runs on <paramref name="page"/> whose bounding boxes
    /// intersect the given selection rectangle. The rectangle coordinates are
    /// expressed in PDF user/page coordinate system.
    /// </summary>
    /// <param name="page">Page to read.</param>
    /// <param name="pos">Selection rectangle.</param>
    /// <param name="reader">Reader used for the traversal; Begin and End are called on it here.</param>
    /// <returns>The matching runs, each followed by a new line; empty when nothing matches.</returns>
    public string ReadTextFromRect(pdftron.PDF.Page page, pdftron.PDF.Rect pos, ElementReader reader)
    {
        // A local builder replaces the former instance field that carried the
        // result between the helper and this method.
        System.Text.StringBuilder found = new System.Text.StringBuilder();
        reader.Begin(page);
        try
        {
            RectTextSearch(reader, pos, found);
        }
        finally
        {
            // Always pair Begin with End, even if the traversal throws.
            reader.End();
        }
        return found.ToString();
    }
}
342}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales