Some test text!

Discord Logo

Chat with us

PDFTron is now Apryse, learn more here.

Read a PDF file in C# (parse & extract text)

More languages

More languages
JavaScript
Java (Android)
C++
C#
C# (.NET Core)
Go
Java
Kotlin
Obj-C
JS (Node.js)
PHP
Python
Ruby
Swift
C# (UWP)
VB
C# (Xamarin)

Sample C# code for using PDFTron SDK to read a PDF (parse and extract text). If you'd like to search text on PDF pages, see our code sample for text search. Learn more about our C# PDF Library and PDF Parsing & Content Extraction Library.

Get Started Samples Download

To run this sample, get started with a free trial of Apryse SDK.

//
// Copyright (c) 2001-2020 by PDFTron Systems Inc. All Rights Reserved.
//

using System;
using System.IO;
using System.Threading.Tasks;
using Windows.Foundation;

using pdftron.PDF;
using pdftron.SDF;

using PDFNetUniversalSamples.ViewModels;

namespace PDFNetSamples
{
    public sealed class TextExtractTest : Sample
    {
        public TextExtractTest() :
            base("TextExtract", "The sample illustrates the basic text extraction capabilities of PDFNet.")
        {
        }

        public override IAsyncAction RunAsync()
        {
            return Task.Run(new System.Action(() => {
                WriteLine("--------------------------------");
                WriteLine("Starting TextExtract Test...");
                WriteLine("--------------------------------\n");
                bool example1_basic = true;
                bool example2_xml = true;
                bool example3_wordlist = true;
                bool example4_advanced = false;
                bool example5_low_level = false;

                // Sample code showing how to use high-level text extraction APIs.

                try
                {
                    string input_file_path = Path.Combine(InputPath, "newsletter.pdf");
                    WriteLine("Opening input file " + input_file_path);
                    PDFDoc doc = new PDFDoc(input_file_path);
                    doc.InitSecurityHandler();
                    pdftron.PDF.Page page = doc.GetPage(1);
                    if (page == null)
                    {
                        WriteLine("Page not found.");
                        return;
                    }

                    TextExtractor txt = new TextExtractor();
                    txt.Begin(page);  // Read the page.
                    // Other options you may want to consider...
                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                    // ...

                    string outputResult = string.Empty;

                    // Example 1. Get all text on the page in a single string.
                    // Words will be separated with space or new line characters.
                    if (example1_basic)
                    {
                        // Get the word count.
                        outputResult += string.Format("Word Count: {0}\n", txt.GetWordCount());
                        outputResult += string.Format("\n- GetAsText --------------------------\n{0}\n", txt.GetAsText());
                        outputResult += "-----------------------------------------------------------\n";
                    }

                    // Example 2. Get XML logical structure for the page.
                    if (example2_xml)
                    {
                        String text = txt.GetAsXML(TextExtractorXMLOutputFlags.e_words_as_elements | TextExtractorXMLOutputFlags.e_output_bbox | TextExtractorXMLOutputFlags.e_output_style_info);

                        outputResult += string.Format("\n\n- GetAsXML  --------------------------\n{0}\n", text);
                        outputResult += "-----------------------------------------------------------\n";
                    }

                    // Example 3. Extract words one by one.
                    if (example3_wordlist)
                    {
                        TextExtractorWord word;
                        for (TextExtractorLine line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                        {
                            string wordlist = "";
                            for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                            {
                                wordlist += " " + word.GetString();
                            }
                            outputResult += wordlist + "\n";
                        }
                        outputResult += "-----------------------------------------------------------\n";
                    }

                    // Print result of Example 1, 2 and 3
                    WriteLine(outputResult);

                    // Example 3. A more advanced text extraction example. 
                    // The output is XML structure containing paragraphs, lines, words, 
                    // as well as style and positioning information.
                    if (example4_advanced)
                    {
                        pdftron.PDF.Rect bbox;
                        int cur_flow_id = -1, cur_para_id = -1;

                        TextExtractorLine line;
                        TextExtractorWord word;
                        TextExtractorStyle s, line_style;

                        // For each line on the page...
                        for (line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                        {
                            if (line.GetNumWords() == 0)
                            {
                                continue;
                            }

                            if (cur_flow_id != line.GetFlowID())
                            {
                                if (cur_flow_id != -1)
                                {
                                    if (cur_para_id != -1)
                                    {
                                        cur_para_id = -1;
                                        WriteLine("</Para>");
                                    }
                                    WriteLine("</Flow>");
                                }
                                cur_flow_id = line.GetFlowID();
                                WriteLine(string.Format("<Flow id=\"{0}\">", cur_flow_id));
                            }

                            if (cur_para_id != line.GetParagraphID())
                            {
                                if (cur_para_id != -1)
                                    WriteLine("</Para>");
                                cur_para_id = line.GetParagraphID();
                                WriteLine(string.Format("<Para id=\"{0}\">", cur_para_id));
                            }

                            bbox = line.GetBBox();
                            line_style = line.GetStyle();
                            WriteLine(string.Format("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1, bbox.y1, bbox.x2, bbox.y2));
                            PrintStyle(line_style);
                            

                            // For each word in the line...
                            for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                            {
                                // Output the bounding box for the word.
                                bbox = word.GetBBox();
                                Write(string.Format("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1, bbox.y1, bbox.x2, bbox.y2));

                                int sz = word.GetStringLen();
                                if (sz == 0) continue;

                                // If the word style is different from the parent style, output the new style.
                                s = word.GetStyle();
                                if (s != line_style)
                                {
                                    PrintStyle(s);
                                }

                                WriteLine(string.Format(">\n{0}", word.GetString()));
                                WriteLine("</Word>");
                            }
                            WriteLine("</Line>");
                        }

                        if (cur_flow_id != -1)
                        {
                            if (cur_para_id != -1)
                            {
                                cur_para_id = -1;
                                WriteLine("</Para>");
                            }
                            WriteLine("</Flow>");
                        }
                    }

                    doc.Destroy();
                    WriteLine("Done.");
                }
                catch (Exception e)
                {
                    WriteLine(GetExceptionMessage(e));
                }

                // Sample code showing how to use low-level text extraction APIs.
                if (example5_low_level)
                {
                    try
                    {
                        LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
                        PDFDoc doc = new PDFDoc(Path.Combine(InputPath, "newsletter.pdf"));
                        doc.InitSecurityHandler();

                        // Example 1. Extract all text content from the document
                        ElementReader reader = new ElementReader();
                        PageIterator itr = doc.GetPageIterator();
                        //for (; itr.HasNext(); itr.Next()) //  Read every page
                        {
                            reader.Begin(itr.Current());
                            //LowLevelTextExtractUtils.DumpAllText(reader);
                            WriteLine(util.DumpAllText(reader, this));
                            reader.End();
                        }

                        // Example 2. Extract text based on the selection rectangle.
                        WriteLine("----------------------------------------------------");
                        WriteLine("Extract text based on the selection rectangle.");
                        WriteLine("----------------------------------------------------");

                        pdftron.PDF.Page first_page = doc.GetPage(1);
                        string field1 = util.ReadTextFromRect(first_page, new pdftron.PDF.Rect(27, 392, 563, 534), reader);
                        string field2 = util.ReadTextFromRect(first_page, new pdftron.PDF.Rect(28, 551, 106, 623), reader);
                        string field3 = util.ReadTextFromRect(first_page, new pdftron.PDF.Rect(208, 550, 387, 621), reader);

                        WriteLine(string.Format("Field 1: {0}", field1));
                        WriteLine(string.Format("Field 2: {0}", field2));
                        WriteLine(string.Format("Field 3: {0}", field3));
                        // ... 


                        doc.Destroy();
                        WriteLine("Done.");
                    }
                    catch (Exception e)
                    {
                        WriteLine(GetExceptionMessage(e));
                    }
                }

                WriteLine("\n--------------------------------");
                WriteLine("Done TextExtract Test.");
                WriteLine("--------------------------------\n");
                Flush();
            })).AsAsyncAction();
        }

        void PrintStyle(TextExtractorStyle s)
        {
            WriteLine(string.Format(" style=\"font-family: {0}; font-size: {1}; {2}\"", s.GetFontName(), s.GetFontSize(), (s.IsSerif() ? " sans-serif; " : " ")));
        }
    }

    sealed class LowLevelTextExtractUtils
    {
        // A utility method used to dump all text content in the 
        // console window.

        public String DumpAllText(ElementReader reader, Sample sample)
        {
            String result = "";
            Element element;
            //int i = 0;
            while ((element = reader.Next()) != null)
            {
                switch (element.GetType())
                {
                    case ElementType.e_text_begin:
                        result += ("--> Text Block Begin");
                        break;
                    case ElementType.e_text_end:
                        result += ("--> Text Block End");
                        break;
                    case ElementType.e_text:
                        {
                            pdftron.PDF.Rect bbox = new pdftron.PDF.Rect();
                            element.GetBBox(bbox);
                            result += (string.Format("\n--> BBox: {0}, {1}, {2}, {3}", bbox.x1, bbox.y1, bbox.x2, bbox.y2));

                            String txt = element.GetTextString();
                            sample.WriteLine(txt);
                            break;
                        }
                    case ElementType.e_text_new_line:
                        {
                            result += ("--> New Line");
                            break;
                        }
                    case ElementType.e_form: // Process form XObjects
                        {
                            reader.FormBegin();
                            DumpAllText(reader, sample);
                            reader.End();
                            break;
                        }
                }
            }
            return result;
        }

        string _srch_str;

        // A helper method for ReadTextFromRect
        void RectTextSearch(ElementReader reader, pdftron.PDF.Rect pos)
        {
            Element element;
            while ((element = reader.Next()) != null)
            {
                switch (element.GetType())
                {
                    case ElementType.e_text:
                        {
                            pdftron.PDF.Rect bbox = new pdftron.PDF.Rect();
                            element.GetBBox(bbox);
                            if (bbox.IntersectRect(bbox, pos))
                            {
                                _srch_str += element.GetTextString();
                                _srch_str += "\n"; // add a new line?
                            }
                            break;
                        }
                    case ElementType.e_text_new_line:
                        {
                            break;
                        }
                    case ElementType.e_form: // Process form XObjects
                        {
                            reader.FormBegin();
                            RectTextSearch(reader, pos);
                            reader.End();
                            break;
                        }
                }
            }
        }

        // A utility method used to extract all text content from
        // a given selection rectangle. The rectangle coordinates are
        // expressed in PDF user/page coordinate system.
        public string ReadTextFromRect(pdftron.PDF.Page page, pdftron.PDF.Rect pos, ElementReader reader)
        {
            _srch_str = "";
            reader.Begin(page);
            RectTextSearch(reader, pos);
            reader.End();
            return _srch_str;
        }
    }
}