TextExtract

Sample C# code for using the Apryse SDK to read a PDF (parse and extract text). If you'd like to search text on PDF pages, see our code sample for text search. Learn more about our Xamarin SDK and PDF Data Extraction SDK capabilities.

1//
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3//
4
5using System;
6using System.Drawing;
7using pdftron;
8using pdftron.Common;
9using pdftron.Filters;
10using pdftron.SDF;
11using pdftron.PDF;
12
13
14using NUnit.Framework;
15
16namespace MiscellaneousSamples
17{
18 // This sample illustrates various text extraction capabilities of PDFNet.
19
/// <summary>
/// NUnit fixture that demonstrates the text-extraction APIs used below:
/// the high-level <c>TextExtractor</c> (plain text, XML, word-by-word and
/// structured output) and the low-level <c>ElementReader</c> helpers in
/// <c>LowLevelTextExtractUtils</c>.
/// </summary>
[TestFixture]
public class TextExtractTest
{
    /// <summary>
    /// Opens "newsletter.pdf" from the test-files folder and runs whichever
    /// examples are enabled by the boolean switches at the top of the method.
    /// A <c>PDFNetException</c> is printed and fails the test.
    /// </summary>
    [Test]
    public static void Sample()
    {
        // Relative path to the folder containing test files.
        const string input_path = "TestFiles/";

        // Switches selecting which examples run.
        bool example1_basic = false;
        bool example2_xml = false;
        bool example3_wordlist = false;
        bool example4_advanced = true;
        bool example5_low_level = false;

        // Sample code showing how to use high-level text extraction APIs.
        try
        {
            using (PDFDoc doc = new PDFDoc(Utils.GetAssetTempFile(input_path + "newsletter.pdf")))
            {
                doc.InitSecurityHandler();

                Page page = doc.GetPage(1);
                if (page == null)
                {
                    Console.WriteLine("Page not found.");
                    return;
                }

                using (TextExtractor txt = new TextExtractor())
                {
                    txt.Begin(page); // Read the page.
                    // Other options you may want to consider...
                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                    // ...

                    // Example 1. Get all text on the page in a single string.
                    // Words will be separated with space or new line characters.
                    if (example1_basic)
                    {
                        // Get the word count.
                        Console.WriteLine("Word Count: {0}", txt.GetWordCount());

                        Console.WriteLine("\n\n- GetAsText --------------------------\n{0}", txt.GetAsText());
                        Console.WriteLine("-----------------------------------------------------------");
                    }

                    // Example 2. Get XML logical structure for the page.
                    if (example2_xml)
                    {
                        String text = txt.GetAsXML(
                            TextExtractor.XMLOutputFlags.e_words_as_elements
                            | TextExtractor.XMLOutputFlags.e_output_bbox
                            | TextExtractor.XMLOutputFlags.e_output_style_info);
                        Console.WriteLine("\n\n- GetAsXML --------------------------\n{0}", text);
                        Console.WriteLine("-----------------------------------------------------------");
                    }

                    // Example 3. Extract words one by one.
                    if (example3_wordlist)
                    {
                        for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                        {
                            for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                            {
                                Console.WriteLine(word.GetString());
                            }
                        }
                        Console.WriteLine("-----------------------------------------------------------");
                    }

                    // Example 4. A more advanced text extraction example.
                    // The output is XML structure containing paragraphs, lines, words,
                    // as well as style and positioning information.
                    if (example4_advanced)
                    {
                        PrintAdvancedStructure(txt);
                    }
                }
            }
        }
        catch (PDFNetException e)
        {
            Console.WriteLine(e.Message);
            // Fail with the exception text so the test report says why.
            Assert.Fail(e.Message);
        }

        // Sample code showing how to use low-level text extraction APIs.
        if (example5_low_level)
        {
            try
            {
                LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
                using (PDFDoc doc = new PDFDoc(Utils.GetAssetTempFile(input_path + "newsletter.pdf")))
                {
                    doc.InitSecurityHandler();

                    // Example 1. Extract all text content from the document
                    using (ElementReader reader = new ElementReader())
                    {
                        PageIterator itr = doc.GetPageIterator();
                        //for (; itr.HasNext(); itr.Next()) // Read every page
                        {
                            // With the loop above disabled, only the page the
                            // iterator currently points at is dumped.
                            reader.Begin(itr.Current());
                            LowLevelTextExtractUtils.DumpAllText(reader);
                            reader.End();
                        }

                        // Example 2. Extract text based on the selection rectangle.
                        Console.WriteLine("----------------------------------------------------");
                        Console.WriteLine("Extract text based on the selection rectangle.");
                        Console.WriteLine("----------------------------------------------------");

                        Page first_page = doc.GetPage(1);
                        string field1 = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                        string field2 = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                        string field3 = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                        Console.WriteLine("Field 1: {0}", field1);
                        Console.WriteLine("Field 2: {0}", field2);
                        Console.WriteLine("Field 3: {0}", field3);
                        // ...

                        Console.WriteLine("Done.");
                    }
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
                Assert.Fail(e.Message);
            }
        }
    }

    // Writes the lines and words held by txt to the console as nested
    // <PDFText>/<Flow>/<Para>/<Line>/<Word> elements with bounding boxes and
    // style attributes. The opening and closing <PDFText> tags are both
    // emitted here so they are always paired.
    // NOTE(review): word text is written verbatim, without XML escaping.
    static void PrintAdvancedStructure(TextExtractor txt)
    {
        // -1 means "no flow / paragraph element is currently open".
        int cur_flow_id = -1, cur_para_id = -1;

        Console.WriteLine("<PDFText>");

        // For each line on the page...
        for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
        {
            if (line.GetNumWords() == 0)
            {
                continue;
            }

            // A change of flow closes the open paragraph and flow (if any)
            // before a new flow element is opened.
            if (cur_flow_id != line.GetFlowID())
            {
                if (cur_flow_id != -1)
                {
                    if (cur_para_id != -1)
                    {
                        cur_para_id = -1;
                        Console.WriteLine("</Para>");
                    }
                    Console.WriteLine("</Flow>");
                }
                cur_flow_id = line.GetFlowID();
                Console.WriteLine("<Flow id=\"{0}\">", cur_flow_id);
            }

            if (cur_para_id != line.GetParagraphID())
            {
                if (cur_para_id != -1)
                {
                    Console.WriteLine("</Para>");
                }
                cur_para_id = line.GetParagraphID();
                Console.WriteLine("<Para id=\"{0}\">", cur_para_id);
            }

            Rect bbox = line.GetBBox();
            TextExtractor.Style line_style = line.GetStyle();
            Console.Write("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
            PrintStyle(line_style);
            Console.Write(" cur_num=\"" + line.GetCurrentNum() + "\"" + ">\n");

            // For each word in the line...
            for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
            {
                // Skip empty words before anything is written, so no
                // unterminated <Word ...> tag is left in the output.
                if (word.GetStringLen() == 0)
                {
                    continue;
                }

                // Output the bounding box for the word.
                bbox = word.GetBBox();
                Console.Write("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
                Console.Write(" cur_num=\"" + word.GetCurrentNum() + "\"");

                // If the word style is different from the parent style, output the new style.
                TextExtractor.Style s = word.GetStyle();
                if (s != line_style)
                {
                    PrintStyle(s);
                }

                Console.Write(">{0}", word.GetString());
                Console.WriteLine("</Word>");
            }
            Console.WriteLine("</Line>");
        }

        // Close whatever is still open after the last line.
        if (cur_flow_id != -1)
        {
            if (cur_para_id != -1)
            {
                cur_para_id = -1;
                Console.WriteLine("</Para>");
            }
            Console.WriteLine("</Flow>");
        }

        Console.WriteLine("</PDFText>");
    }

    // Writes a style="..." attribute for s: font name, font size, a " serif;"
    // marker when s.IsSerif() is true, and the colour as six hex digits.
    static void PrintStyle(TextExtractor.Style s)
    {
        int[] rgb = s.GetColor();
        // The trailing ';' is part of the formatted colour value.
        String rgb_hex = String.Format("{0:X02}{1:X02}{2:X02};", rgb[0], rgb[1], rgb[2]);
        Console.Write(" style=\"font-family:{0}; font-size:{1};{2} color:#{3}\"", s.GetFontName(), s.GetFontSize(), (s.IsSerif() ? " serif;" : ""), rgb_hex);
    }
}
232
/// <summary>
/// Helpers that walk a page's content with an <c>ElementReader</c> to print
/// or collect its text elements.
/// </summary>
class LowLevelTextExtractUtils
{
    /// <summary>
    /// Prints every text element the reader yields to the console, one per
    /// line, with markers at the start and end of each text block. Form
    /// XObjects are entered recursively.
    /// </summary>
    /// <param name="reader">A reader on which <c>Begin</c> has already been called.</param>
    public static void DumpAllText(ElementReader reader)
    {
        Element element;
        while ((element = reader.Next()) != null)
        {
            switch (element.GetType())
            {
                case Element.Type.e_text_begin:
                    Console.WriteLine("\n--> Text Block Begin");
                    break;
                case Element.Type.e_text_end:
                    Console.WriteLine("\n--> Text Block End");
                    break;
                case Element.Type.e_text:
                {
                    // The box is only consumed by the optional diagnostic line below.
                    Rect bbox = new Rect();
                    element.GetBBox(bbox);
                    // Console.WriteLine("\n--> BBox: {0}, {1}, {2}, {3}", bbox.x1, bbox.y1, bbox.x2, bbox.y2);

                    String txt = element.GetTextString();
                    Console.Write(txt);
                    Console.WriteLine("");
                    break;
                }
                case Element.Type.e_text_new_line:
                {
                    // Console.WriteLine("\n--> New Line");
                    break;
                }
                case Element.Type.e_form: // Process form XObjects
                {
                    reader.FormBegin();
                    DumpAllText(reader);
                    reader.End();
                    break;
                }
            }
        }
    }

    // A helper method for ReadTextFromRect: appends to found the string of
    // every text element for which bbox.IntersectRect(bbox, pos) reports
    // true, each followed by a new line. Form XObjects are searched
    // recursively. The caller's buffer is passed in explicitly so that no
    // state is kept on the instance between calls.
    static void RectTextSearch(ElementReader reader, Rect pos, System.Text.StringBuilder found)
    {
        Element element;
        while ((element = reader.Next()) != null)
        {
            switch (element.GetType())
            {
                case Element.Type.e_text:
                {
                    Rect bbox = new Rect();
                    element.GetBBox(bbox);
                    if (bbox.IntersectRect(bbox, pos))
                    {
                        found.Append(element.GetTextString());
                        found.Append("\n");
                    }
                    break;
                }
                case Element.Type.e_text_new_line:
                {
                    break;
                }
                case Element.Type.e_form: // Process form XObjects
                {
                    reader.FormBegin();
                    RectTextSearch(reader, pos, found);
                    reader.End();
                    break;
                }
            }
        }
    }

    /// <summary>
    /// A utility method used to extract all text content from a given
    /// selection rectangle. The rectangle coordinates are expressed in PDF
    /// user/page coordinate system.
    /// </summary>
    /// <param name="page">Page to read.</param>
    /// <param name="pos">Selection rectangle.</param>
    /// <param name="reader">Reader used to walk the page; <c>Begin</c> and <c>End</c> are called on it here.</param>
    /// <returns>The matching text runs, each terminated by "\n"; an empty string when nothing matches.</returns>
    public string ReadTextFromRect(Page page, Rect pos, ElementReader reader)
    {
        // A builder avoids re-copying the accumulated text on every match.
        System.Text.StringBuilder found = new System.Text.StringBuilder();
        reader.Begin(page);
        RectTextSearch(reader, pos, found);
        reader.End();
        return found.ToString();
    }
}
326}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales