More languages
Some test text!
More languages
Sample Kotlin code for using PDFTron SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Learn more about our Kotlin PDF Library and PDF Parsing & Content Extraction Library.
Get Started | Samples | Download — To run this sample, get started with a free trial of Apryse SDK.
//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
package com.pdftron.android.pdfnetsdksamples.samples
import com.pdftron.android.pdfnetsdksamples.OutputListener
import com.pdftron.android.pdfnetsdksamples.PDFNetSample
import com.pdftron.android.pdfnetsdksamples.R
import com.pdftron.android.pdfnetsdksamples.util.Utils
import com.pdftron.common.PDFNetException
import com.pdftron.filters.FilterReader
import com.pdftron.pdf.*
import java.util.*
class ElementReaderAdvTest : PDFNetSample() {
// Registers this sample's title and description by passing their string
// resource ids to setTitle()/setDescription() (presumably inherited from
// PDFNetSample — confirm against the base class).
init {
setTitle(R.string.sample_elementreaderadv_title)
setDescription(R.string.sample_elementreaderadv_description)
}
/**
 * Runs the sample: opens the bundled `newsletter.pdf`, walks every page with an
 * [ElementReader] and reports each page's elements through [outputListener].
 *
 * Any exception raised while opening or processing the document is reported via
 * `printError` instead of being propagated. The document is closed in a `finally`
 * block so it is released even when a page fails to process.
 *
 * @param outputListener sink for the sample's output; it is dereferenced
 * unconditionally, so passing `null` fails with a NullPointerException.
 */
override fun run(outputListener: OutputListener?) {
    super.run(outputListener)
    mOutputListener = outputListener
    mFileList.clear()
    // Dereference once; every statement below needs a non-null listener.
    val out = outputListener!!
    printHeader(out)
    try {
        out.println("__________________________________________________")
        out.println("Extract page element information from all ")
        out.println("pages in the document.")
        val inputFile = Utils.getAssetTempFile(PDFNetSample.INPUT_PATH + "newsletter.pdf")!!
        val doc = PDFDoc(inputFile.absolutePath)
        try {
            doc.initSecurityHandler()
            val pageReader = ElementReader()
            val pages = doc.pageIterator
            // Read every page
            while (pages.hasNext()) {
                val page = pages.next()!!
                out.println("Page ${page.getIndex()}----------------------------------------")
                pageReader.begin(page)
                ProcessElements(pageReader)
                pageReader.end()
            }
        } finally {
            // Close the document explicitly instead of waiting for the garbage
            // collector, and do so even when processing a page threw.
            doc.close()
        }
        out.println("Done")
    } catch (e: Exception) {
        out.printError(e.stackTrace)
    }
    for (file in mFileList) {
        addToFileList(file)
    }
    printFooter(out)
}
companion object {
// Output sink shared by the Process* functions in this companion object;
// run() assigns it before any of them is called.
private var mOutputListener: OutputListener? = null
// List of file names that run() clears and then forwards, entry by entry, to
// addToFileList(). Nothing in this class as shown ever appends to it.
private val mFileList = ArrayList<String>()
// Not referenced anywhere in this class as shown.
// NOTE(review): looks like a leftover from the original sample — confirm no
// other module reads it before removing.
internal var m_buf: String? = null
/**
 * Reports one path element: its geometry, how it is painted, and the graphics
 * state changes that [reader] recorded for it.
 *
 * Output goes to the shared output listener. When the fill colour is a
 * non-shading pattern, the pattern's content is read with [ProcessElements],
 * so the two functions are mutually recursive.
 *
 * @param reader the reader that produced [path]; its change list is cleared
 * before this function returns normally.
 * @param path the path element to describe.
 * @throws PDFNetException if the path data contains an unrecognised operator,
 * or if an SDK call fails.
 */
@Throws(PDFNetException::class)
internal fun ProcessPath(reader: ElementReader, path: Element) {
    val out = mOutputListener!!
    if (path.isClippingPath) {
        out.println("This is a clipping path")
    }

    // Use path.getCTM() if you are interested in CTM (current transformation matrix).
    printPathData(out, path.pathData)

    val gs = path.gState

    // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    if (path.isStroked) {
        out.println("Stroke path")
        if (gs.strokeColorSpace.type == ColorSpace.e_pattern) {
            out.println("Path has associated pattern")
        } else {
            // Demonstrates PDFNet's colour conversion; the RGB result is not reported.
            val strokeColor = gs.strokeColor
            gs.strokeColorSpace.convert2RGB(strokeColor)
        }
    }

    if (path.isFilled) {
        out.println("Fill path")
        if (gs.fillColorSpace.type == ColorSpace.e_pattern) {
            out.println("Path has associated pattern")
            printFillPattern(out, gs.fillPattern)
        } else {
            // Demonstrates PDFNet's colour conversion; the RGB result is not reported.
            val fillColor = gs.fillColor
            gs.fillColorSpace.convert2RGB(fillColor)
        }
    }

    // Process any changes in graphics state ---------------------------------
    val changes = reader.changesIterator
    while (changes.hasNext()) {
        when (changes.next()!!.toInt()) {
            GState.e_transform,
            GState.e_line_width,
            GState.e_line_cap,
            GState.e_line_join,
            GState.e_flatness,
            GState.e_miter_limit,
            GState.e_dash_pattern -> {
                // Nothing is reported for these attributes. Their values can be
                // read with gs.getTransform(), gs.getLineWidth(), gs.getLineCap(),
                // gs.getLineJoin(), gs.getMiterLimit(), gs.getDashes() and
                // gs.getPhase(). Unlike path.getCTM(), which returns the full
                // transformation matrix, gs.getTransform() returns only the
                // matrix that was installed for this element.
            }
            GState.e_fill_color -> {
                if (gs.fillColorSpace.type == ColorSpace.e_pattern &&
                    gs.fillPattern.type != PatternColor.e_shading
                ) {
                    // Process the pattern data
                    reader.patternBegin(true)
                    ProcessElements(reader)
                    reader.end()
                }
            }
        }
    }
    reader.clearChangeList()
}

/**
 * Prints the operators and coordinates of [pathData] on [out], prefixing each
 * segment with a single-letter code (M, L, C) and expanding rectangles into
 * four corners followed by Z.
 *
 * @throws PDFNetException if an operator is not one of the handled kinds.
 */
@Throws(PDFNetException::class)
private fun printPathData(out: OutputListener, pathData: PathData) {
    val points = pathData.points
    val operators = pathData.operators
    // Index of the next unread coordinate in points; each operator consumes a
    // fixed number of coordinates.
    var next = 0

    out.print(" Path Data Points := \"")
    for (op in operators) {
        when (op) {
            PathData.e_moveto.toByte() -> {
                val x = points[next++]
                val y = points[next++]
                out.print("M$x $y")
            }
            PathData.e_lineto.toByte() -> {
                val x = points[next++]
                val y = points[next++]
                out.print(" L$x $y")
            }
            PathData.e_cubicto.toByte() -> {
                val x1 = points[next++]
                val y1 = points[next++]
                val x2 = points[next++]
                val y2 = points[next++]
                val x3 = points[next++]
                val y3 = points[next++]
                out.print(" C$x1 $y1 $x2 $y2 $x3 $y3")
            }
            PathData.e_rect.toByte() -> {
                // Stored as origin plus width and height; printed as its corners.
                val x = points[next++]
                val y = points[next++]
                val farX = x + points[next++]
                val farY = y + points[next++]
                out.print("M$x $y L$farX $y L$farX $farY L$x $farY Z")
            }
            PathData.e_closepath.toByte() -> out.println(" Close Path")
            else -> throw PDFNetException("Invalid Element Type", 0, "", "", "")
        }
    }
    out.print("\" ")
}

/** Prints the kind of [pattern] on [out] and, for shading patterns, the shading type. */
@Throws(PDFNetException::class)
private fun printFillPattern(out: OutputListener, pattern: PatternColor) {
    when (pattern.type) {
        PatternColor.e_shading -> {
            out.println("Shading")
            // Shading types other than these three are not reported.
            when (pattern.shading.type) {
                Shading.e_function_shading -> out.println("FUNCT")
                Shading.e_axial_shading -> out.println("AXIAL")
                Shading.e_radial_shading -> out.println("RADIAL")
            }
        }
        PatternColor.e_colored_tiling_pattern -> out.println("e_colored_tiling_pattern")
        PatternColor.e_uncolored_tiling_pattern -> out.println("e_uncolored_tiling_pattern")
        else -> out.println("?")
    }
}
/**
 * Reads elements from [page_reader] until the current text block ends (or the
 * reader runs out of elements), printing the font name and the character codes
 * of every text run to the shared output listener.
 *
 * Glyphs of Type 3 fonts are read as nested content via [ProcessElements].
 * For other fonts each glyph's position is computed but not printed.
 *
 * @param page_reader reader positioned just after a text-begin element.
 */
@Throws(PDFNetException::class)
internal fun ProcessText(page_reader: ElementReader) {
    val out = mOutputListener!!
    // Begin text element
    out.println("Begin Text Block:")

    while (true) {
        val current = page_reader.next() ?: break
        when (current.type) {
            Element.e_text_end -> {
                // Finish the text block
                out.println("End Text Block.")
                return
            }
            Element.e_text -> {
                val state = current.gState

                val fillSpace = state.fillColorSpace
                val fillColor = state.fillColor
                val fillRgb: ColorPt = fillSpace.convert2RGB(fillColor)

                val strokeSpace = state.strokeColorSpace
                val strokeColor = state.strokeColor

                val font = state.font
                out.println("Font Name: ${font.name}")
                // Also available: font.isFixedWidth(), font.isSerif(),
                // font.isSymbolic(), font.isItalic(), gs.getFontSize(),
                // gs.getWordSpacing(), gs.getCharSpacing(), element.getTextString().

                if (font.type != Font.e_Type3) {
                    val textMatrix = current.textMatrix
                    val glyphs = current.charIterator
                    while (glyphs.hasNext()) {
                        val glyph = glyphs.next()!!
                        val code: Long = glyph.getCharCode()
                        out.print(code.toString())

                        // Character positioning information, relative to the text matrix.
                        val glyphX = glyph.getGlyphX()
                        val glyphY = glyph.getGlyphY()

                        // To get the exact character position, concatenate the
                        // text matrix with the CTM (current transformation
                        // matrix) and map the relative coordinates through it.
                        val combined = current.ctm.multiply(textMatrix)
                        val mapped = combined.multPoint(glyphX, glyphY)
                        val pageX = mapped.x
                        val pageY = mapped.y
                        // out.println(" Position: x=" + pageX + " y=" + pageY)
                    }
                    out.println()
                } else {
                    // Type 3 font: each glyph is itself a content stream, process its data.
                    val glyphs = current.charIterator
                    while (glyphs.hasNext()) {
                        page_reader.type3FontBegin(glyphs.next(), null)
                        ProcessElements(page_reader)
                        page_reader.end()
                    }
                }
            }
        }
    }
}
/**
 * Reports the dimensions of an image element and reads its pixels converted to
 * RGB into a buffer sized at three bytes per pixel.
 *
 * The decoded bytes are not used further; the read only demonstrates the
 * [Image2RGB] / [FilterReader] pipeline.
 *
 * @param image the image element to describe.
 * @throws PDFNetException if the RGB data would not fit in a single byte array,
 * or if an SDK call fails.
 */
@Throws(PDFNetException::class)
internal fun ProcessImage(image: Element) {
    // image.isImageMask and image.isImageInterpolate are also available here.
    val width = image.imageWidth
    val height = image.imageHeight
    mOutputListener!!.println("Image:  width=\"$width\" height=\"$height\"")

    // Size the buffer in Long arithmetic: width * height * 3 overflows Int for
    // large images, which would give a negative or silently undersized buffer.
    val rgbByteCount = width.toLong() * height.toLong() * 3L
    if (rgbByteCount > Int.MAX_VALUE) {
        throw PDFNetException("Image too large to read into a single buffer", 0, "", "", "")
    }

    // image.getCTM() gives the image matrix (page positioning info).
    // image.getImageData() / image.getBitsPerComponent() expose the raw decoded
    // data; the Image2RGB filter instead converts every image to RGB 8-bpc,
    // which avoids dealing with colour conversion, up-sampling and decoding.
    val rgbFilter = Image2RGB(image)
    val reader = FilterReader(rgbFilter)

    // A buffer used to keep image data.
    val buf = ByteArray(rgbByteCount.toInt())
    // After this call buf holds RGB image data. The whole image does not have
    // to be read at once: reader.read(buf) can be called repeatedly with a
    // smaller buffer until it returns 0.
    reader.read(buf)
}
/**
 * Reads elements from [reader] until it returns no more, dispatching each one
 * by type: paths to [ProcessPath], text blocks to [ProcessText], images to
 * [ProcessImage]. Form XObjects are entered and processed recursively.
 * Elements of any other type are skipped.
 *
 * @param reader a reader on which begin (or a nested begin) has been called.
 */
@Throws(PDFNetException::class)
internal fun ProcessElements(reader: ElementReader) {
    // Read page contents
    while (true) {
        val current = reader.next() ?: return
        when (current.type) {
            // Process path data...
            Element.e_path -> ProcessPath(reader, current)
            // Process text block...
            Element.e_text_begin -> ProcessText(reader)
            // Process form XObjects
            Element.e_form -> {
                reader.formBegin()
                ProcessElements(reader)
                reader.end()
            }
            // Process Images
            Element.e_image -> ProcessImage(current)
        }
    }
}
}