TextExtract

Sample C# code for using the Apryse SDK to read a PDF (parse and extract text). If you'd like to search for text on PDF pages, see our code sample for text search. Learn more about our Server SDK and PDF Data Extraction SDK capabilities.

1//
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3//
4
5using System;
6using System.Drawing;
7using pdftron;
8using pdftron.Common;
9using pdftron.Filters;
10using pdftron.SDF;
11using pdftron.PDF;
12
13
14namespace TextExtractTestCS
15{
16 // This sample illustrates various text extraction capabilities of PDFNet.
17
// Console sample that exercises the high-level TextExtractor API and the
// low-level ElementReader API against the first page of "newsletter.pdf".
// Which examples run is controlled by the boolean flags at the top of Main.
class Class1
{
    // NOTE(review): presumably obtaining the loader instance in a static
    // initializer makes the native PDFNet library available before Main
    // runs, and the empty static constructor pins when that happens -- confirm.
    private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
    static Class1() {}

    // Entry point. Initializes PDFNet, runs the enabled examples and always
    // terminates PDFNet again, including on the early "Page not found." exit.
    static void Main(string[] args)
    {
        PDFNet.Initialize(PDFTronLicense.Key);

        try
        {
            // Relative path to the folder containing test files.
            string input_path = "../../../../TestFiles/";

            bool example1_basic = false;
            bool example2_xml = false;
            bool example3_wordlist = false;
            bool example4_advanced = true;
            bool example5_low_level = false;

            // Sample code showing how to use high-level text extraction APIs.
            try
            {
                using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
                {
                    doc.InitSecurityHandler();

                    Page page = doc.GetPage(1);
                    if (page == null)
                    {
                        Console.WriteLine("Page not found.");
                        return; // the outer finally still calls PDFNet.Terminate()
                    }

                    using (TextExtractor txt = new TextExtractor())
                    {
                        txt.Begin(page); // Read the page.
                        // Other options you may want to consider...
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                        // ...

                        // Example 1. Get all text on the page in a single string.
                        // Words will be separated with space or new line characters.
                        if (example1_basic)
                        {
                            // Get the word count.
                            Console.WriteLine("Word Count: {0}", txt.GetWordCount());

                            Console.WriteLine("\n\n- GetAsText --------------------------\n{0}", txt.GetAsText());
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 2. Get XML logical structure for the page.
                        if (example2_xml)
                        {
                            String text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info);
                            Console.WriteLine("\n\n- GetAsXML --------------------------\n{0}", text);
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 3. Extract words one by one.
                        if (example3_wordlist)
                        {
                            for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                            {
                                for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                {
                                    Console.WriteLine(word.GetString());
                                }
                            }
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 4. A more advanced text extraction example.
                        // The output is XML structure containing paragraphs, lines, words,
                        // as well as style and positioning information.
                        if (example4_advanced)
                        {
                            PrintAdvancedXml(txt);
                        }
                    }
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }

            // Sample code showing how to use low-level text extraction APIs.
            if (example5_low_level)
            {
                try
                {
                    LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
                    using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
                    {
                        doc.InitSecurityHandler();

                        // Example 1. Extract all text content from the document
                        using (ElementReader reader = new ElementReader())
                        {
                            PageIterator itr = doc.GetPageIterator();
                            //for (; itr.HasNext(); itr.Next()) // Read every page
                            {
                                reader.Begin(itr.Current());
                                LowLevelTextExtractUtils.DumpAllText(reader);
                                reader.End();
                            }

                            // Example 2. Extract text based on the selection rectangle.
                            Console.WriteLine("----------------------------------------------------");
                            Console.WriteLine("Extract text based on the selection rectangle.");
                            Console.WriteLine("----------------------------------------------------");

                            Page first_page = doc.GetPage(1);
                            string field1 = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                            string field2 = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                            string field3 = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                            Console.WriteLine("Field 1: {0}", field1);
                            Console.WriteLine("Field 2: {0}", field2);
                            Console.WriteLine("Field 3: {0}", field3);
                            // ...

                            Console.WriteLine("Done.");
                        }
                    }
                }
                catch (PDFNetException e)
                {
                    Console.WriteLine(e.Message);
                }
            }
        }
        finally
        {
            PDFNet.Terminate();
        }
    }

    // Writes the extractor's content to the console as a <PDFText> document:
    // <Flow> and <Para> wrappers keyed by the ids reported by each line, then
    // one <Line> per non-empty line and one <Word> per non-empty word, each
    // with its bounding box and (where it differs from the line) its style.
    // The root element is opened and closed here so the two always pair up.
    private static void PrintAdvancedXml(TextExtractor txt)
    {
        int cur_flow_id = -1, cur_para_id = -1;

        Console.WriteLine("<PDFText>");

        // For each line on the page...
        for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
        {
            if (line.GetNumWords() == 0)
            {
                continue;
            }

            // Entering a new flow: close whatever paragraph/flow is still open.
            if (cur_flow_id != line.GetFlowID())
            {
                if (cur_flow_id != -1)
                {
                    if (cur_para_id != -1)
                    {
                        cur_para_id = -1;
                        Console.WriteLine("</Para>");
                    }
                    Console.WriteLine("</Flow>");
                }
                cur_flow_id = line.GetFlowID();
                Console.WriteLine("<Flow id=\"{0}\">", cur_flow_id);
            }

            // Entering a new paragraph within the current flow.
            if (cur_para_id != line.GetParagraphID())
            {
                if (cur_para_id != -1)
                {
                    Console.WriteLine("</Para>");
                }
                cur_para_id = line.GetParagraphID();
                Console.WriteLine("<Para id=\"{0}\">", cur_para_id);
            }

            Rect bbox = line.GetBBox();
            TextExtractor.Style line_style = line.GetStyle();
            Console.Write("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
            PrintStyle(line_style);
            Console.Write(" cur_num=\"" + line.GetCurrentNum() + "\"" + ">\n");

            // For each word in the line...
            for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
            {
                // Skip empty words before writing anything, otherwise the
                // output is left with an unterminated "<Word ..." fragment.
                if (word.GetStringLen() == 0)
                {
                    continue;
                }

                // Output the bounding box for the word.
                bbox = word.GetBBox();
                Console.Write("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
                Console.Write(" cur_num=\"" + word.GetCurrentNum() + "\"");

                // If the word style is different from the parent style, output the new style.
                TextExtractor.Style s = word.GetStyle();
                if (s != line_style)
                {
                    PrintStyle(s);
                }

                Console.Write(">{0}", word.GetString());
                Console.WriteLine("</Word>");
            }
            Console.WriteLine("</Line>");
        }

        // Close the last open paragraph and flow, if any line was emitted.
        if (cur_flow_id != -1)
        {
            if (cur_para_id != -1)
            {
                cur_para_id = -1;
                Console.WriteLine("</Para>");
            }
            Console.WriteLine("</Flow>");
        }

        Console.WriteLine("</PDFText>");
    }

    // Writes a style="..." attribute (font name, font size, an optional
    // "serif" marker and the color as #RRGGBB) for the given text style.
    static void PrintStyle(TextExtractor.Style s)
    {
        Color rgb = s.GetColor();
        String rgb_hex = String.Format("{0:X02}{1:X02}{2:X02};", rgb.R, rgb.G, rgb.B);
        // The marker is emitted only when IsSerif() is true, so it must read "serif".
        Console.Write(" style=\"font-family:{0}; font-size:{1};{2} color:#{3}\"", s.GetFontName(), s.GetFontSize(), (s.IsSerif() ? " serif;" : ""), rgb_hex);
    }
}
230
// Helpers for the low-level extraction examples: they walk the elements
// produced by an ElementReader and pick out the text elements, descending
// into form XObjects recursively.
class LowLevelTextExtractUtils
{
    // A utility method used to dump all text content in the
    // console window. The reader must already have been started
    // (Begin/FormBegin) by the caller; this method only consumes elements.
    public static void DumpAllText(ElementReader reader)
    {
        Element element;
        while ((element = reader.Next()) != null)
        {
            switch (element.GetType())
            {
                case Element.Type.e_text_begin:
                    Console.WriteLine("\n--> Text Block Begin");
                    break;
                case Element.Type.e_text_end:
                    Console.WriteLine("\n--> Text Block End");
                    break;
                case Element.Type.e_text:
                {
                    // The bounding box is only needed by the optional debug line below.
                    Rect bbox = new Rect();
                    element.GetBBox(bbox);
                    // Console.WriteLine("\n--> BBox: {0}, {1}, {2}, {3}", bbox.x1, bbox.y1, bbox.x2, bbox.y2);

                    Console.WriteLine(element.GetTextString());
                    break;
                }
                case Element.Type.e_text_new_line:
                {
                    // Console.WriteLine("\n--> New Line");
                    break;
                }
                case Element.Type.e_form: // Process form XObjects
                {
                    reader.FormBegin();
                    DumpAllText(reader);
                    reader.End();
                    break;
                }
            }
        }
    }

    // A helper method for ReadTextFromRect. Appends to 'found' the text of
    // every text element whose bounding box passes the IntersectRect test
    // against 'pos', each followed by a newline. The accumulator is passed in
    // explicitly so the recursion into form XObjects shares it without
    // relying on instance state.
    private static void RectTextSearch(ElementReader reader, Rect pos, System.Text.StringBuilder found)
    {
        Element element;
        while ((element = reader.Next()) != null)
        {
            switch (element.GetType())
            {
                case Element.Type.e_text:
                {
                    Rect bbox = new Rect();
                    element.GetBBox(bbox);
                    // NOTE(review): presumably returns true when bbox and pos
                    // overlap (bbox is also the receiver of the call) -- confirm.
                    if (bbox.IntersectRect(bbox, pos))
                    {
                        found.Append(element.GetTextString());
                        found.Append("\n"); // add a new line?
                    }
                    break;
                }
                case Element.Type.e_text_new_line:
                {
                    break;
                }
                case Element.Type.e_form: // Process form XObjects
                {
                    reader.FormBegin();
                    RectTextSearch(reader, pos, found);
                    reader.End();
                    break;
                }
            }
        }
    }

    // A utility method used to extract all text content from
    // a given selection rectangle. The rectangle coordinates are
    // expressed in PDF user/page coordinate system.
    // Returns the matching text (one line per matching text element),
    // or an empty string when nothing matches.
    public string ReadTextFromRect(Page page, Rect pos, ElementReader reader)
    {
        System.Text.StringBuilder found = new System.Text.StringBuilder();
        reader.Begin(page);
        RectTextSearch(reader, pos, found);
        reader.End();
        return found.ToString();
    }
}
324}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales