TextExtract

Sample C# code for using the Apryse SDK to read a PDF (parse and extract text). If you'd like to search for text on PDF pages, see our code sample for text search. Learn more about our UWP SDK and PDF Data Extraction SDK capabilities.
1//
2// Copyright (c) 2001-2020 by PDFTron Systems Inc. All Rights Reserved.
3//
4
5using System;
6using System.IO;
7using System.Threading.Tasks;
8using Windows.Foundation;
9
10using pdftron.PDF;
11using pdftron.SDF;
12
13using PDFNetUniversalSamples.ViewModels;
14
15namespace PDFNetSamples
16{
17    public sealed class TextExtractTest : Sample
18    {
19        public TextExtractTest() :
20            base("TextExtract", "The sample illustrates the basic text extraction capabilities of PDFNet.")
21        {
22        }
23
24        public override IAsyncAction RunAsync()
25        {
26            return Task.Run(new System.Action(() => {
27                WriteLine("--------------------------------");
28                WriteLine("Starting TextExtract Test...");
29                WriteLine("--------------------------------\n");
30                bool example1_basic = true;
31                bool example2_xml = true;
32                bool example3_wordlist = true;
33                bool example4_advanced = false;
34                bool example5_low_level = false;
35
36                // Sample code showing how to use high-level text extraction APIs.
37
38                try
39                {
40                    string input_file_path = Path.Combine(InputPath, "newsletter.pdf");
41                    WriteLine("Opening input file " + input_file_path);
42                    PDFDoc doc = new PDFDoc(input_file_path);
43                    doc.InitSecurityHandler();
44                    pdftron.PDF.Page page = doc.GetPage(1);
45                    if (page == null)
46                    {
47                        WriteLine("Page not found.");
48                        return;
49                    }
50
51                    TextExtractor txt = new TextExtractor();
52                    txt.Begin(page);  // Read the page.
53                    // Other options you may want to consider...
54                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
55                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
56                    // ...
57
58                    string outputResult = string.Empty;
59
60                    // Example 1. Get all text on the page in a single string.
61                    // Words will be separated with space or new line characters.
62                    if (example1_basic)
63                    {
64                        // Get the word count.
65                        outputResult += string.Format("Word Count: {0}\n", txt.GetWordCount());
66                        outputResult += string.Format("\n- GetAsText --------------------------\n{0}\n", txt.GetAsText());
67                        outputResult += "-----------------------------------------------------------\n";
68                    }
69
70                    // Example 2. Get XML logical structure for the page.
71                    if (example2_xml)
72                    {
73                        String text = txt.GetAsXML(TextExtractorXMLOutputFlags.e_words_as_elements | TextExtractorXMLOutputFlags.e_output_bbox | TextExtractorXMLOutputFlags.e_output_style_info);
74
75                        outputResult += string.Format("\n\n- GetAsXML  --------------------------\n{0}\n", text);
76                        outputResult += "-----------------------------------------------------------\n";
77                    }
78
79                    // Example 3. Extract words one by one.
80                    if (example3_wordlist)
81                    {
82                        TextExtractorWord word;
83                        for (TextExtractorLine line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
84                        {
85                            string wordlist = "";
86                            for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
87                            {
88                                wordlist += " " + word.GetString();
89                            }
90                            outputResult += wordlist + "\n";
91                        }
92                        outputResult += "-----------------------------------------------------------\n";
93                    }
94
95                    // Print result of Example 1, 2 and 3
96                    WriteLine(outputResult);
97
98                    // Example 3. A more advanced text extraction example. 
99                    // The output is XML structure containing paragraphs, lines, words, 
100                    // as well as style and positioning information.
101                    if (example4_advanced)
102                    {
103                        pdftron.PDF.Rect bbox;
104                        int cur_flow_id = -1, cur_para_id = -1;
105
106                        TextExtractorLine line;
107                        TextExtractorWord word;
108                        TextExtractorStyle s, line_style;
109
110                        // For each line on the page...
111                        for (line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
112                        {
113                            if (line.GetNumWords() == 0)
114                            {
115                                continue;
116                            }
117
118                            if (cur_flow_id != line.GetFlowID())
119                            {
120                                if (cur_flow_id != -1)
121                                {
122                                    if (cur_para_id != -1)
123                                    {
124                                        cur_para_id = -1;
125                                        WriteLine("</Para>");
126                                    }
127                                    WriteLine("</Flow>");
128                                }
129                                cur_flow_id = line.GetFlowID();
130                                WriteLine(string.Format("<Flow id=\"{0}\">", cur_flow_id));
131                            }
132
133                            if (cur_para_id != line.GetParagraphID())
134                            {
135                                if (cur_para_id != -1)
136                                    WriteLine("</Para>");
137                                cur_para_id = line.GetParagraphID();
138                                WriteLine(string.Format("<Para id=\"{0}\">", cur_para_id));
139                            }
140
141                            bbox = line.GetBBox();
142                            line_style = line.GetStyle();
143                            WriteLine(string.Format("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1, bbox.y1, bbox.x2, bbox.y2));
144                            PrintStyle(line_style);
145                            
146
147                            // For each word in the line...
148                            for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
149                            {
150                                // Output the bounding box for the word.
151                                bbox = word.GetBBox();
152                                Write(string.Format("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1, bbox.y1, bbox.x2, bbox.y2));
153
154                                int sz = word.GetStringLen();
155                                if (sz == 0) continue;
156
157                                // If the word style is different from the parent style, output the new style.
158                                s = word.GetStyle();
159                                if (s != line_style)
160                                {
161                                    PrintStyle(s);
162                                }
163
164                                WriteLine(string.Format(">\n{0}", word.GetString()));
165                                WriteLine("</Word>");
166                            }
167                            WriteLine("</Line>");
168                        }
169
170                        if (cur_flow_id != -1)
171                        {
172                            if (cur_para_id != -1)
173                            {
174                                cur_para_id = -1;
175                                WriteLine("</Para>");
176                            }
177                            WriteLine("</Flow>");
178                        }
179                    }
180
181                    doc.Destroy();
182                    WriteLine("Done.");
183                }
184                catch (Exception e)
185                {
186                    WriteLine(GetExceptionMessage(e));
187                }
188
189                // Sample code showing how to use low-level text extraction APIs.
190                if (example5_low_level)
191                {
192                    try
193                    {
194                        LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
195                        PDFDoc doc = new PDFDoc(Path.Combine(InputPath, "newsletter.pdf"));
196                        doc.InitSecurityHandler();
197
198                        // Example 1. Extract all text content from the document
199                        ElementReader reader = new ElementReader();
200                        PageIterator itr = doc.GetPageIterator();
201                        //for (; itr.HasNext(); itr.Next()) //  Read every page
202                        {
203                            reader.Begin(itr.Current());
204                            //LowLevelTextExtractUtils.DumpAllText(reader);
205                            WriteLine(util.DumpAllText(reader, this));
206                            reader.End();
207                        }
208
209                        // Example 2. Extract text based on the selection rectangle.
210                        WriteLine("----------------------------------------------------");
211                        WriteLine("Extract text based on the selection rectangle.");
212                        WriteLine("----------------------------------------------------");
213
214                        pdftron.PDF.Page first_page = doc.GetPage(1);
215                        string field1 = util.ReadTextFromRect(first_page, new pdftron.PDF.Rect(27, 392, 563, 534), reader);
216                        string field2 = util.ReadTextFromRect(first_page, new pdftron.PDF.Rect(28, 551, 106, 623), reader);
217                        string field3 = util.ReadTextFromRect(first_page, new pdftron.PDF.Rect(208, 550, 387, 621), reader);
218
219                        WriteLine(string.Format("Field 1: {0}", field1));
220                        WriteLine(string.Format("Field 2: {0}", field2));
221                        WriteLine(string.Format("Field 3: {0}", field3));
222                        // ... 
223
224
225                        doc.Destroy();
226                        WriteLine("Done.");
227                    }
228                    catch (Exception e)
229                    {
230                        WriteLine(GetExceptionMessage(e));
231                    }
232                }
233
234                WriteLine("\n--------------------------------");
235                WriteLine("Done TextExtract Test.");
236                WriteLine("--------------------------------\n");
237                Flush();
238            })).AsAsyncAction();
239        }
240
241        void PrintStyle(TextExtractorStyle s)
242        {
243            WriteLine(string.Format(" style=\"font-family: {0}; font-size: {1}; {2}\"", s.GetFontName(), s.GetFontSize(), (s.IsSerif() ? " sans-serif; " : " ")));
244        }
245    }
246
247    sealed class LowLevelTextExtractUtils
248    {
249        // A utility method used to dump all text content in the 
250        // console window.
251
252        public String DumpAllText(ElementReader reader, Sample sample)
253        {
254            String result = "";
255            Element element;
256            //int i = 0;
257            while ((element = reader.Next()) != null)
258            {
259                switch (element.GetType())
260                {
261                    case ElementType.e_text_begin:
262                        result += ("--> Text Block Begin");
263                        break;
264                    case ElementType.e_text_end:
265                        result += ("--> Text Block End");
266                        break;
267                    case ElementType.e_text:
268                        {
269                            pdftron.PDF.Rect bbox = new pdftron.PDF.Rect();
270                            element.GetBBox(bbox);
271                            result += (string.Format("\n--> BBox: {0}, {1}, {2}, {3}", bbox.x1, bbox.y1, bbox.x2, bbox.y2));
272
273                            String txt = element.GetTextString();
274                            sample.WriteLine(txt);
275                            break;
276                        }
277                    case ElementType.e_text_new_line:
278                        {
279                            result += ("--> New Line");
280                            break;
281                        }
282                    case ElementType.e_form: // Process form XObjects
283                        {
284                            reader.FormBegin();
285                            DumpAllText(reader, sample);
286                            reader.End();
287                            break;
288                        }
289                }
290            }
291            return result;
292        }
293
294        string _srch_str;
295
296        // A helper method for ReadTextFromRect
297        void RectTextSearch(ElementReader reader, pdftron.PDF.Rect pos)
298        {
299            Element element;
300            while ((element = reader.Next()) != null)
301            {
302                switch (element.GetType())
303                {
304                    case ElementType.e_text:
305                        {
306                            pdftron.PDF.Rect bbox = new pdftron.PDF.Rect();
307                            element.GetBBox(bbox);
308                            if (bbox.IntersectRect(bbox, pos))
309                            {
310                                _srch_str += element.GetTextString();
311                                _srch_str += "\n"; // add a new line?
312                            }
313                            break;
314                        }
315                    case ElementType.e_text_new_line:
316                        {
317                            break;
318                        }
319                    case ElementType.e_form: // Process form XObjects
320                        {
321                            reader.FormBegin();
322                            RectTextSearch(reader, pos);
323                            reader.End();
324                            break;
325                        }
326                }
327            }
328        }
329
330        // A utility method used to extract all text content from
331        // a given selection rectangle. The rectangle coordinates are
332        // expressed in PDF user/page coordinate system.
333        public string ReadTextFromRect(pdftron.PDF.Page page, pdftron.PDF.Rect pos, ElementReader reader)
334        {
335            _srch_str = "";
336            reader.Begin(page);
337            RectTextSearch(reader, pos);
338            reader.End();
339            return _srch_str;
340        }
341    }
342}
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales
Product:

TextExtract