TextExtract

Sample C# code that uses the Apryse SDK to read a PDF (parse and extract its text). If you would like to search for text on PDF pages, see our code sample for text search. Learn more about our Xamarin SDK and PDF Data Extraction SDK Capabilities.
//
// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
//
4
5using System;
6using System.Drawing;
7using pdftron;
8using pdftron.Common;
9using pdftron.Filters;
10using pdftron.SDF;
11using pdftron.PDF;
12
13
14using NUnit.Framework;
15
16namespace MiscellaneousSamples
17{
18	// This sample illustrates various text extraction capabilities of PDFNet.
19
[TestFixture]
public class TextExtractTest
{
	// Runs the text extraction examples against page 1 of "newsletter.pdf".
	//
	// The example* flags below select which outputs are written to the console.
	// Examples 1-4 use the high-level TextExtractor API; example 5 uses the
	// low-level ElementReader helpers in LowLevelTextExtractUtils.
	// A PDFNetException is logged and reported as a test failure.
	[Test]
	public static void Sample()
	{
		// Relative path to the folder containing test files.
		const string input_path = "TestFiles/";

		bool example1_basic     = false;
		bool example2_xml       = false;
		bool example3_wordlist  = false;
		bool example4_advanced  = true;
		bool example5_low_level = false;

		// Sample code showing how to use high-level text extraction APIs.
		try
		{
			using (PDFDoc doc = new PDFDoc(Utils.GetAssetTempFile(input_path + "newsletter.pdf")))
			{
				doc.InitSecurityHandler();

				Page page = doc.GetPage(1);
				if (page == null)
				{
					Console.WriteLine("Page not found.");
					return;
				}

				using (TextExtractor txt = new TextExtractor())
				{
					txt.Begin(page);  // Read the page.
					// Other options you may want to consider...
					// txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
					// txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
					// ...

					// Example 1. Get all text on the page in a single string.
					// Words will be separated with space or new line characters.
					if (example1_basic)
					{
						// Get the word count.
						Console.WriteLine("Word Count: {0}", txt.GetWordCount());

						Console.WriteLine("\n\n- GetAsText --------------------------\n{0}", txt.GetAsText());
						Console.WriteLine("-----------------------------------------------------------");
					}

					// Example 2. Get XML logical structure for the page.
					if (example2_xml)
					{
						String text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info);
						Console.WriteLine("\n\n- GetAsXML  --------------------------\n{0}", text);
						Console.WriteLine("-----------------------------------------------------------");
					}

					// Example 3. Extract words one by one.
					if (example3_wordlist)
					{
						TextExtractor.Word word;
						for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
						{
							for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
							{
								Console.WriteLine(word.GetString());
							}
						}
						Console.WriteLine("-----------------------------------------------------------");
					}

					// Example 4. A more advanced text extraction example.
					// The output is XML structure containing paragraphs, lines, words,
					// as well as style and positioning information.
					if (example4_advanced)
					{
						Rect bbox;
						// -1 means "no <Flow> / <Para> element is currently open".
						int cur_flow_id = -1, cur_para_id = -1;

						TextExtractor.Line line;
						TextExtractor.Word word;
						TextExtractor.Style s, line_style;

						Console.WriteLine("<PDFText>");
						// For each line on the page...
						for (line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
						{
							if (line.GetNumWords() == 0)
							{
								continue;
							}

							// A change of flow closes any open paragraph and flow
							// before the new flow element is opened.
							if (cur_flow_id != line.GetFlowID())
							{
								if (cur_flow_id != -1)
								{
									if (cur_para_id != -1)
									{
										cur_para_id = -1;
										Console.WriteLine("</Para>");
									}
									Console.WriteLine("</Flow>");
								}
								cur_flow_id = line.GetFlowID();
								Console.WriteLine("<Flow id=\"{0}\">", cur_flow_id);
							}

							if (cur_para_id != line.GetParagraphID())
							{
								if (cur_para_id != -1)
								{
									Console.WriteLine("</Para>");
								}
								cur_para_id = line.GetParagraphID();
								Console.WriteLine("<Para id=\"{0}\">", cur_para_id);
							}

							bbox = line.GetBBox();
							line_style = line.GetStyle();
							Console.Write("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
							PrintStyle(line_style);
							Console.Write(" cur_num=\"" + line.GetCurrentNum() + "\"" + ">\n");

							// For each word in the line...
							for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
							{
								// Skip empty words before anything is written, so that no
								// unterminated "<Word ..." fragment ends up in the output.
								if (word.GetStringLen() == 0)
								{
									continue;
								}

								// Output the bounding box for the word.
								bbox = word.GetBBox();
								Console.Write("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
								Console.Write(" cur_num=\"" + word.GetCurrentNum() + "\"");

								// If the word style is different from the parent style, output the new style.
								s = word.GetStyle();
								if (s != line_style)
								{
									PrintStyle(s);
								}

								Console.Write(">{0}", word.GetString());
								Console.WriteLine("</Word>");
							}
							Console.WriteLine("</Line>");
						}

						// Close whatever is still open after the last line.
						if (cur_flow_id != -1)
						{
							if (cur_para_id != -1)
							{
								cur_para_id = -1;
								Console.WriteLine("</Para>");
							}
							Console.WriteLine("</Flow>");
						}

						// The root element is closed in the same branch that opened it,
						// so the other examples never emit a stray closing tag.
						Console.WriteLine("</PDFText>");
					}
				}
			}
		}
		catch (PDFNetException e)
		{
			Console.WriteLine(e.Message);
			// Fail with the SDK message so the test report shows the cause.
			Assert.Fail(e.Message);
		}

		// Sample code showing how to use low-level text extraction APIs.
		if (example5_low_level)
		{
			try
			{
				LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
				using (PDFDoc doc = new PDFDoc(Utils.GetAssetTempFile(input_path + "newsletter.pdf")))
				{
					doc.InitSecurityHandler();

					// Example 1. Extract all text content from the document
					using (ElementReader reader = new ElementReader())
					{
						PageIterator itr = doc.GetPageIterator();
						// Only the page the iterator currently points at is read;
						// enable the loop below to read every page.
						//for (; itr.HasNext(); itr.Next()) //  Read every page
						{
							reader.Begin(itr.Current());
							LowLevelTextExtractUtils.DumpAllText(reader);
							reader.End();
						}

						// Example 2. Extract text based on the selection rectangle.
						Console.WriteLine("----------------------------------------------------");
						Console.WriteLine("Extract text based on the selection rectangle.");
						Console.WriteLine("----------------------------------------------------");

						Page first_page = doc.GetPage(1);
						string field1 = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
						string field2 = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
						string field3 = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

						Console.WriteLine("Field 1: {0}", field1);
						Console.WriteLine("Field 2: {0}", field2);
						Console.WriteLine("Field 3: {0}", field3);
						// ...

						Console.WriteLine("Done.");
					}
				}
			}
			catch (PDFNetException e)
			{
				Console.WriteLine(e.Message);
				Assert.Fail(e.Message);
			}
		}
	}

	// Writes a style="..." attribute (font name, font size, optional generic
	// family, hex color) for the given text style to the console, without a
	// trailing newline.
	static void PrintStyle(TextExtractor.Style s)
	{
		int[] rgb = s.GetColor();
		String rgb_hex = String.Format("{0:X02}{1:X02}{2:X02};", rgb[0], rgb[1], rgb[2]);
		// NOTE(review): " sans-serif;" is emitted when IsSerif() is true, which looks
		// inverted. Left as-is because existing reference output may depend on it - confirm.
		Console.Write(" style=\"font-family:{0}; font-size:{1};{2} color:#{3}\"", s.GetFontName(), s.GetFontSize(), (s.IsSerif() ? " sans-serif;" : ""), rgb_hex);
	}
}
232
// Helpers that demonstrate low-level text extraction by walking page
// content with an ElementReader.
class LowLevelTextExtractUtils
{
	// A utility method used to dump all text content in the
	// console window.
	//
	// Reads elements from 'reader' until Next() returns null. The caller is
	// expected to have called reader.Begin(...) beforehand and to call
	// reader.End() afterwards. Form XObjects are entered recursively.
	public static void DumpAllText(ElementReader reader)
	{
		Element element;
		while ((element = reader.Next()) != null)
		{
			switch (element.GetType())
			{
				case Element.Type.e_text_begin:
					Console.WriteLine("\n--> Text Block Begin");
					break;
				case Element.Type.e_text_end:
					Console.WriteLine("\n--> Text Block End");
					break;
				case Element.Type.e_text:
				{
					// The bounding box is only needed by the optional trace line below.
					Rect bbox = new Rect();
					element.GetBBox(bbox);
					// Console.WriteLine("\n--> BBox: {0}, {1}, {2}, {3}", bbox.x1, bbox.y1, bbox.x2, bbox.y2);

					String txt = element.GetTextString();
					Console.Write(txt);
					Console.WriteLine("");
					break;
				}
				case Element.Type.e_text_new_line:
				{
					// Console.WriteLine("\n--> New Line");
					break;
				}
				case Element.Type.e_form: // Process form XObjects
				{
					// FormBegin() is paired with End() once the nested content has been dumped.
					reader.FormBegin();
					DumpAllText(reader);
					reader.End();
					break;
				}
			}
		}
	}


	// A helper method for ReadTextFromRect.
	//
	// Appends to 'found' the text of every text element whose bounding box
	// intersects 'pos', each followed by a newline. Form XObjects are
	// searched recursively. The result is passed in explicitly instead of
	// being kept in an instance field, so calls do not share mutable state
	// and the text is not rebuilt by repeated string concatenation.
	void RectTextSearch(ElementReader reader, Rect pos, System.Text.StringBuilder found)
	{
		Element element;
		while ((element = reader.Next()) != null)
		{
			switch (element.GetType())
			{
				case Element.Type.e_text:
				{
					Rect bbox = new Rect();
					element.GetBBox(bbox);
					// NOTE(review): IntersectRect presumably also stores the intersection
					// in bbox; only its boolean result is used here.
					if (bbox.IntersectRect(bbox, pos))
					{
						found.Append(element.GetTextString());
						found.Append("\n"); // add a new line?
					}
					break;
				}
				case Element.Type.e_text_new_line:
				{
					break;
				}
				case Element.Type.e_form: // Process form XObjects
				{
					reader.FormBegin();
					RectTextSearch(reader, pos, found);
					reader.End();
					break;
				}
			}
		}
	}

	// A utility method used to extract all text content from
	// a given selection rectangle. The rectangle coordinates are
	// expressed in PDF user/page coordinate system.
	//
	// Begins and ends 'reader' on 'page' itself, and returns the collected
	// text (an empty string when no text element intersects 'pos').
	public string ReadTextFromRect(Page page, Rect pos, ElementReader reader)
	{
		System.Text.StringBuilder found = new System.Text.StringBuilder();
		reader.Begin(page);
		RectTextSearch(reader, pos, found);
		reader.End();
		return found.ToString();
	}
}
326}
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales
Product:

TextExtract