More languages
Some test text!
More languages
Sample Kotlin code for using PDFTron SDK to read a PDF (parse and extract text). If you'd like to search text on PDF pages, see our code sample for text search. Learn more about our Kotlin PDF Library and PDF Parsing & Content Extraction Library.
Get Started Samples DownloadTo run this sample, get started with a free trial of Apryse SDK.
//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
package com.pdftron.android.pdfnetsdksamples.samples
import com.pdftron.android.pdfnetsdksamples.OutputListener
import com.pdftron.android.pdfnetsdksamples.PDFNetSample
import com.pdftron.android.pdfnetsdksamples.R
import com.pdftron.android.pdfnetsdksamples.util.Utils
import com.pdftron.common.PDFNetException
import com.pdftron.pdf.*
import java.text.DecimalFormat
import java.util.*
class TextExtractTest : PDFNetSample() {
init {
setTitle(R.string.sample_textextract_title)
setDescription(R.string.sample_textextract_description)
}
override fun run(outputListener: OutputListener?) {
super.run(outputListener)
mOutputListener = outputListener
mFileList.clear()
printHeader(outputListener!!)
// string output_path = "../../TestFiles/Output/";
val example1_basic = false
val example2_xml = false
val example3_wordlist = false
val example4_advanced = true
val example5_low_level = false
// Sample code showing how to use high-level text extraction APIs.
try {
val doc = PDFDoc(Utils.getAssetTempFile(PDFNetSample.INPUT_PATH + "newsletter.pdf")!!.absolutePath)
doc.initSecurityHandler()
val page = doc.getPage(1)
if (page == null) {
mOutputListener!!.println("Page not found.")
}
val txt = TextExtractor()
txt.begin(page!!) // Read the page.
// Other options you may want to consider...
// txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_no_dup_remove);
// txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_remove_hidden_text);
// ...
// Example 1. Get all text on the page in a single string.
// Words will be separated with space or new line characters.
if (example1_basic) {
// Get the word count.
mOutputListener!!.println("Word Count: " + txt.wordCount)
mOutputListener!!.println("\n\n- GetAsText --------------------------\n" + txt.asText)
mOutputListener!!.println("-----------------------------------------------------------")
}
// Example 2. Get XML logical structure for the page.
if (example2_xml) {
val text = txt.getAsXML(TextExtractor.e_words_as_elements or TextExtractor.e_output_bbox or TextExtractor.e_output_style_info)
mOutputListener!!.println("\n\n- GetAsXML --------------------------\n$text")
mOutputListener!!.println("-----------------------------------------------------------")
}
// Example 3. Extract words one by one.
if (example3_wordlist) {
var word: TextExtractor.Word
var line: TextExtractor.Line = txt.firstLine
while (line.isValid) {
word = line.firstWord
while (word.isValid) {
mOutputListener!!.println(word.string)
word = word.nextWord
}
line = line.nextLine
}
mOutputListener!!.println("-----------------------------------------------------------")
}
// Example 4. A more advanced text extraction example.
// The output is XML structure containing paragraphs, lines, words,
// as well as style and positioning information.
if (example4_advanced) {
var bbox: Rect
var cur_flow_id = -1
var cur_para_id = -1
var line: TextExtractor.Line
var word: TextExtractor.Word
var s: TextExtractor.Style
var line_style: TextExtractor.Style
mOutputListener!!.println("<PDFText>")
// For each line on the page...
line = txt.firstLine
while (line.isValid) {
if (line.numWords == 0) {
line = line.nextLine
continue
}
if (cur_flow_id != line.flowID) {
if (cur_flow_id != -1) {
if (cur_para_id != -1) {
cur_para_id = -1
mOutputListener!!.println("</Para>")
}
mOutputListener!!.println("</Flow>")
}
cur_flow_id = line.flowID
mOutputListener!!.println("<Flow id=\"$cur_flow_id\">")
}
if (cur_para_id != line.paragraphID) {
if (cur_para_id != -1)
mOutputListener!!.println("</Para>")
cur_para_id = line.paragraphID
mOutputListener!!.println("<Para id=\"$cur_para_id\">")
}
bbox = line.bBox
line_style = line.style
mOutputListener!!.print("<Line box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", bbox.x1, bbox.y1, bbox.x2, bbox.y2) + "\"")
printStyle(line_style)
mOutputListener!!.println(" cur_num=\"" + line.currentNum + "\">")
// For each word in the line...
word = line.firstWord
while (word.isValid) {
// Output the bounding box for the word.
bbox = word.bBox
mOutputListener!!.print("<Word box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", bbox.x1, bbox.y1, bbox.x2, bbox.y2) + "\"")
mOutputListener!!.print(" cur_num=\"" + word.currentNum + "\"")
val sz = word.stringLen
if (sz == 0) {
word = word.nextWord
continue
}
// If the word style is different from the parent style, output the new style.
s = word.style
if (s != line_style) {
printStyle(s)
}
mOutputListener!!.print(">" + word.string)
mOutputListener!!.println("</Word>")
word = word.nextWord
}
mOutputListener!!.println("</Line>")
line = line.nextLine
}
if (cur_flow_id != -1) {
if (cur_para_id != -1) {
cur_para_id = -1
mOutputListener!!.println("</Para>")
}
mOutputListener!!.println("</Flow>")
}
}
txt.destroy()
doc.close()
mOutputListener!!.println("</PDFText>")
} catch (e: PDFNetException) {
mOutputListener!!.printError(e.stackTrace)
}
// Sample code showing how to use low-level text extraction APIs.
if (example5_low_level) {
try {
val doc = PDFDoc(Utils.getAssetTempFile(PDFNetSample.INPUT_PATH + "newsletter.pdf")!!.absolutePath)
doc.initSecurityHandler()
// Example 1. Extract all text content from the document
val reader = ElementReader()
// Read every page
val itr = doc.pageIterator
while (itr.hasNext()) {
reader.begin(itr.next())
DumpAllText(reader)
reader.end()
}
// Example 2. Extract text content based on the
// selection rectangle.
mOutputListener!!.print("\n----------------------------------------------------")
mOutputListener!!.print("\nExtract text based on the selection rectangle.")
mOutputListener!!.println("\n----------------------------------------------------")
val first_page = doc.pageIterator.next()!!
var s1 = ReadTextFromRect(first_page, Rect(27.0, 392.0, 563.0, 534.0), reader)
mOutputListener!!.print("\nField 1: $s1")
s1 = ReadTextFromRect(first_page, Rect(28.0, 551.0, 106.0, 623.0), reader)
mOutputListener!!.print("\nField 2: $s1")
s1 = ReadTextFromRect(first_page, Rect(208.0, 550.0, 387.0, 621.0), reader)
mOutputListener!!.print("\nField 3: $s1")
// ...
doc.close()
mOutputListener!!.println("Done.")
} catch (e: Exception) {
mOutputListener!!.printError(e.stackTrace)
}
}
for (file in mFileList) {
addToFileList(file)
}
printFooter(outputListener)
}
companion object {
private var mOutputListener: OutputListener? = null
private val mFileList = ArrayList<String>()
internal fun printStyle(s: TextExtractor.Style) {
val r = s.color[0]
val g = s.color[1]
val b = s.color[2]
val rgb_hex = String.format("%02X%02X%02X;", r, g, b)
val df = DecimalFormat("#.#")
mOutputListener!!.print(" style=\"font-family:" + s.fontName + "; "
+ "font-size:" + df.format(s.fontSize) + ";"
+ (if (s.isSerif) " sans-serif; " else " ")
+ "color:#" + rgb_hex + "\"")
}
// A utility method used to dump all text content in the console window.
@Throws(PDFNetException::class)
internal fun DumpAllText(reader: ElementReader) {
var element: Element?
while (true) {
element = reader.next()
if (element == null) {
break
}
when (element.type) {
Element.e_text_begin -> mOutputListener!!.println("\n--> Text Block Begin")
Element.e_text_end -> mOutputListener!!.println("\n--> Text Block End")
Element.e_text -> {
val bbox = element.bBox
if (bbox != null) {
mOutputListener!!.println("\n--> BBox: " + bbox.x1 + ", "
+ bbox.y1 + ", "
+ bbox.x2 + ", "
+ bbox.y2)
val arr = element.textString
mOutputListener!!.println(arr)
}
}
Element.e_text_new_line -> mOutputListener!!.println("\n--> New Line")
Element.e_form // Process form XObjects
-> {
reader.formBegin()
DumpAllText(reader)
reader.end()
}
}
}
}
// A helper method for ReadTextFromRect
@Throws(PDFNetException::class)
internal fun RectTextSearch(reader: ElementReader, pos: Rect): String {
var element: Element?
var srch_str = String()
while (true) {
element = reader.next()
if (element == null) {
break
}
when (element.type) {
Element.e_text -> {
val bbox = element.bBox
if (bbox != null) {
if (bbox.intersectRect(bbox, pos)) {
val arr = element.textString
srch_str += arr
srch_str += "\n" // add a new line?
}
}
}
Element.e_text_new_line -> {
}
Element.e_form // Process form XObjects
-> {
reader.formBegin()
srch_str += RectTextSearch(reader, pos)
reader.end()
}
}
}
return srch_str
}
// A utility method used to extract all text content from
// a given selection rectangle. The rectangle coordinates are
// expressed in PDF user/page coordinate system.
@Throws(PDFNetException::class)
internal fun ReadTextFromRect(page: Page, pos: Rect, reader: ElementReader): String {
reader.begin(page)
val srch_str = RectTextSearch(reader, pos)
reader.end()
return srch_str
}
}
}