Some test text!

Search
Hamburger Icon

Read a PDF file in Kotlin (parse & extract text)

More languages

More languages
JavaScript
Java (Android)
C++
C#
C# (.NET Core)
Go
Java
Kotlin
Obj-C
JS (Node.js)
PHP
Python
Ruby
Swift
C# (UWP)
VB
C# (Xamarin)

Sample Kotlin code that uses the PDFTron SDK (now Apryse SDK) to read a PDF file by parsing it and extracting its text. If you'd like to search for text on PDF pages instead, see our code sample for text search. Learn more about our Kotlin PDF Library and our PDF Parsing & Content Extraction Library.

Get Started Samples Download

To run this sample, get started with a free trial of Apryse SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------

package com.pdftron.android.pdfnetsdksamples.samples

import com.pdftron.android.pdfnetsdksamples.OutputListener
import com.pdftron.android.pdfnetsdksamples.PDFNetSample
import com.pdftron.android.pdfnetsdksamples.R
import com.pdftron.android.pdfnetsdksamples.util.Utils
import com.pdftron.common.PDFNetException
import com.pdftron.pdf.*
import java.text.DecimalFormat
import java.util.*

class TextExtractTest : PDFNetSample() {
    // Sets this sample's title and description from the corresponding string resources.
    init {
        setTitle(R.string.sample_textextract_title)
        setDescription(R.string.sample_textextract_description)
    }

    /**
     * Runs the text-extraction sample on the `newsletter.pdf` input obtained through
     * [Utils.getAssetTempFile].
     *
     * The high-level part reads page 1 with [TextExtractor] and prints whichever of
     * examples 1-4 are enabled by the local flags. The low-level part (example 5) walks
     * the page content with [ElementReader]. Errors raised by the SDK are reported through
     * the listener's `printError` rather than propagated.
     *
     * @param outputListener receiver of everything the sample prints; it must not be null
     * (the non-null assertion before the header is printed fails otherwise).
     */
    override fun run(outputListener: OutputListener?) {
        super.run(outputListener)
        mOutputListener = outputListener
        mFileList.clear()
        // Same fail-fast behavior as before on a null listener; `out` is the non-null handle used below.
        val out = outputListener!!
        printHeader(out)

        // Switches selecting which of the examples below are executed.
        val example1_basic = false
        val example2_xml = false
        val example3_wordlist = false
        val example4_advanced = true
        val example5_low_level = false

        // Sample code showing how to use high-level text extraction APIs.
        try {
            val doc = PDFDoc(Utils.getAssetTempFile(PDFNetSample.INPUT_PATH + "newsletter.pdf")!!.absolutePath)
            try {
                doc.initSecurityHandler()

                val page = doc.getPage(1)
                if (page == null) {
                    // Nothing to extract: skip the examples instead of dereferencing a null page.
                    out.println("Page not found.")
                } else {
                    val txt = TextExtractor()
                    try {
                        txt.begin(page) // Read the page.
                        // Other options you may want to consider...
                        // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_no_dup_remove);
                        // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                        // ...

                        // Example 1. Get all text on the page in a single string.
                        // Words will be separated with space or new line characters.
                        if (example1_basic) {
                            // Get the word count.
                            out.println("Word Count: " + txt.wordCount)

                            out.println("\n\n- GetAsText --------------------------\n" + txt.asText)
                            out.println("-----------------------------------------------------------")
                        }

                        // Example 2. Get XML logical structure for the page.
                        if (example2_xml) {
                            val text = txt.getAsXML(TextExtractor.e_words_as_elements or TextExtractor.e_output_bbox or TextExtractor.e_output_style_info)
                            out.println("\n\n- GetAsXML  --------------------------\n$text")
                            out.println("-----------------------------------------------------------")
                        }

                        // Example 3. Extract words one by one.
                        if (example3_wordlist) {
                            printWordList(txt, out)
                        }

                        // Example 4. A more advanced text extraction example.
                        // The output is XML structure containing paragraphs, lines, words,
                        // as well as style and positioning information.
                        if (example4_advanced) {
                            printStructuredText(txt, out)
                        }
                    } finally {
                        // Release the extractor even when one of the examples throws.
                        txt.destroy()
                    }
                }
            } finally {
                // Close the document on every path, including the "page not found" one.
                doc.close()
            }
        } catch (e: PDFNetException) {
            out.printError(e.stackTrace)
        }

        // Sample code showing how to use low-level text extraction APIs.
        if (example5_low_level) {
            try {
                val doc = PDFDoc(Utils.getAssetTempFile(PDFNetSample.INPUT_PATH + "newsletter.pdf")!!.absolutePath)
                try {
                    doc.initSecurityHandler()

                    // Example 1. Extract all text content from the document

                    val reader = ElementReader()
                    //  Read every page
                    val itr = doc.pageIterator
                    while (itr.hasNext()) {
                        reader.begin(itr.next())
                        DumpAllText(reader)
                        reader.end()
                    }

                    // Example 2. Extract text content based on the
                    // selection rectangle.
                    out.print("\n----------------------------------------------------")
                    out.print("\nExtract text based on the selection rectangle.")
                    out.println("\n----------------------------------------------------")

                    val first_page = doc.pageIterator.next()!!
                    var s1 = ReadTextFromRect(first_page, Rect(27.0, 392.0, 563.0, 534.0), reader)
                    out.print("\nField 1: $s1")

                    s1 = ReadTextFromRect(first_page, Rect(28.0, 551.0, 106.0, 623.0), reader)
                    out.print("\nField 2: $s1")

                    s1 = ReadTextFromRect(first_page, Rect(208.0, 550.0, 387.0, 621.0), reader)
                    out.print("\nField 3: $s1")

                    // ...
                } finally {
                    // Close the document even when reading fails part-way through.
                    doc.close()
                }
                out.println("Done.")
            } catch (e: Exception) {
                out.printError(e.stackTrace)
            }

        }

        for (file in mFileList) {
            addToFileList(file)
        }
        printFooter(out)
    }

    /** Example 3 helper: prints every word of the page read by [txt], one per line, to [out]. */
    private fun printWordList(txt: TextExtractor, out: OutputListener) {
        var line: TextExtractor.Line = txt.firstLine
        while (line.isValid) {
            var word: TextExtractor.Word = line.firstWord
            while (word.isValid) {
                out.println(word.string)
                word = word.nextWord
            }
            line = line.nextLine
        }
        out.println("-----------------------------------------------------------")
    }

    /**
     * Example 4 helper: prints the page read by [txt] to [out] as a `<PDFText>` XML structure
     * of flows, paragraphs, lines and words, with bounding boxes and style attributes.
     */
    private fun printStructuredText(txt: TextExtractor, out: OutputListener) {
        // -1 means "no <Flow>/<Para> element is currently open".
        var curFlowId = -1
        var curParaId = -1

        out.println("<PDFText>")
        // For each line on the page...
        var line: TextExtractor.Line = txt.firstLine
        while (line.isValid) {
            if (line.numWords == 0) {
                line = line.nextLine
                continue
            }
            if (curFlowId != line.flowID) {
                // Entering a new flow: close whatever paragraph/flow is still open first.
                if (curFlowId != -1) {
                    if (curParaId != -1) {
                        curParaId = -1
                        out.println("</Para>")
                    }
                    out.println("</Flow>")
                }
                curFlowId = line.flowID
                out.println("<Flow id=\"$curFlowId\">")
            }

            if (curParaId != line.paragraphID) {
                if (curParaId != -1) {
                    out.println("</Para>")
                }
                curParaId = line.paragraphID
                out.println("<Para id=\"$curParaId\">")
            }

            val lineBox: Rect = line.bBox
            val lineStyle: TextExtractor.Style = line.style
            out.print("<Line box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", lineBox.x1, lineBox.y1, lineBox.x2, lineBox.y2) + "\"")
            printStyle(lineStyle)
            out.println(" cur_num=\"" + line.currentNum + "\">")

            // For each word in the line...
            var word: TextExtractor.Word = line.firstWord
            while (word.isValid) {
                // Skip empty words before writing anything, so no dangling "<Word ..." fragment is emitted.
                if (word.stringLen == 0) {
                    word = word.nextWord
                    continue
                }

                // Output the bounding box for the word.
                val wordBox: Rect = word.bBox
                out.print("<Word box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", wordBox.x1, wordBox.y1, wordBox.x2, wordBox.y2) + "\"")
                out.print(" cur_num=\"" + word.currentNum + "\"")

                // If the word style is different from the parent style, output the new style.
                val wordStyle: TextExtractor.Style = word.style
                if (wordStyle != lineStyle) {
                    printStyle(wordStyle)
                }

                out.print(">" + word.string)
                out.println("</Word>")
                word = word.nextWord
            }
            out.println("</Line>")
            line = line.nextLine
        }

        if (curFlowId != -1) {
            if (curParaId != -1) {
                out.println("</Para>")
            }
            out.println("</Flow>")
        }
        // Closing tag lives here so it is only written when the opening <PDFText> was.
        out.println("</PDFText>")
    }

    companion object {

        // Listener that receives the sample's output; assigned at the start of run() and
        // read by the printing helpers in this companion object.
        private var mOutputListener: OutputListener? = null

        // File names passed to addToFileList() at the end of run(); cleared at the start of each run.
        private val mFileList = ArrayList<String>()

        /**
         * Prints [s] to the current output listener as an inline ` style="..."` attribute
         * containing the font name, the font size (at most one decimal place), an optional
         * serif marker and the first three color components as an upper-case hex triplet.
         *
         * @param s the text style to describe.
         */
        internal fun printStyle(s: TextExtractor.Style) {
            val color = s.color
            val rgbHex = String.format("%02X%02X%02X;", color[0], color[1], color[2])
            val sizeFormat = DecimalFormat("#.#")
            // The marker previously read " sans-serif; " for serif fonts, contradicting isSerif.
            val serifMarker = if (s.isSerif) " serif; " else " "
            mOutputListener!!.print(" style=\"font-family:" + s.fontName + "; "
                    + "font-size:" + sizeFormat.format(s.fontSize) + ";"
                    + serifMarker
                    + "color:#" + rgbHex + "\"")
        }

        /**
         * Prints every text-related element produced by [reader] to the current output listener.
         *
         * Elements are consumed until the reader returns `null`. Form XObjects are entered
         * with `formBegin()`, dumped recursively and then left with `end()`.
         *
         * @param reader an element reader on which `begin` has already been called by the caller.
         */
        @Throws(PDFNetException::class)
        internal fun DumpAllText(reader: ElementReader) {
            while (true) {
                // A null element marks the end of the current display list.
                val element = reader.next() ?: break
                when (element.type) {
                    Element.e_text_begin -> mOutputListener!!.println("\n--> Text Block Begin")

                    Element.e_text_end -> mOutputListener!!.println("\n--> Text Block End")

                    Element.e_text -> {
                        val box = element.bBox
                        // Text runs without a bounding box are not printed.
                        if (box != null) {
                            mOutputListener!!.println("\n--> BBox: ${box.x1}, ${box.y1}, ${box.x2}, ${box.y2}")
                            mOutputListener!!.println(element.textString)
                        }
                    }

                    Element.e_text_new_line -> mOutputListener!!.println("\n--> New Line")

                    // Process form XObjects
                    Element.e_form -> {
                        reader.formBegin()
                        DumpAllText(reader)
                        reader.end()
                    }
                }
            }
        }

        /**
         * Helper for [ReadTextFromRect]: collects the text of every text element from [reader]
         * whose bounding box intersects [pos], as reported by `Rect.intersectRect`.
         *
         * Each matching text string is followed by a newline. Form XObjects are searched
         * recursively and their matches appended in place.
         *
         * @param reader an element reader on which `begin` has already been called by the caller.
         * @param pos the selection rectangle.
         * @return the concatenated matches, or an empty string when nothing intersects.
         */
        @Throws(PDFNetException::class)
        internal fun RectTextSearch(reader: ElementReader, pos: Rect): String {
            val collected = StringBuilder()
            while (true) {
                // A null element marks the end of the current display list.
                val element = reader.next() ?: break
                when (element.type) {
                    Element.e_text -> {
                        val box = element.bBox
                        if (box != null && box.intersectRect(box, pos)) {
                            collected.append(element.textString)
                            collected.append("\n") // add a new line?
                        }
                    }

                    Element.e_text_new_line -> {
                        // Explicit line breaks contribute nothing to the result.
                    }

                    // Process form XObjects
                    Element.e_form -> {
                        reader.formBegin()
                        collected.append(RectTextSearch(reader, pos))
                        reader.end()
                    }
                }
            }
            return collected.toString()
        }

        /**
         * Extracts all text content of [page] that falls inside a selection rectangle.
         *
         * The rectangle coordinates are expressed in PDF user/page coordinate system.
         * [reader] is begun on [page] and `end()` is called on it once the search finishes,
         * including when the search throws.
         *
         * @param page the page to read.
         * @param pos the selection rectangle.
         * @param reader the element reader used to walk the page content.
         * @return the text collected by [RectTextSearch] for this page and rectangle.
         */
        @Throws(PDFNetException::class)
        internal fun ReadTextFromRect(page: Page, pos: Rect, reader: ElementReader): String {
            reader.begin(page)
            try {
                return RectTextSearch(reader, pos)
            } finally {
                // Previously skipped when the search threw, leaving the reader open on the page.
                reader.end()
            }
        }
    }

}