More languages
Some test text!
More languages
Sample Swift code for using PDFTron SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Learn more about our Swift PDF Library and PDF Parsing & Content Extraction Library.
Get Started | Samples | Download
To run this sample, get started with a free trial of Apryse SDK.
//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
import PDFNet
import Foundation
/// Prints a one-line description of a path element.
///
/// The description lists the path's segments in an SVG-like notation, followed by
/// whether the path is stroked and/or filled. Afterwards the reader's list of
/// graphics-state changes is walked (recursing into non-shading fill patterns via
/// `ProcessElements`) and then cleared.
///
/// - Parameters:
///   - reader: The element reader that produced `path`; used for the graphics-state
///     change list and for descending into pattern content.
///   - path: The path element to describe.
func ProcessPath(reader: PTElementReader, path: PTElement) {
    if path.isClippingPath() {
        print("This is a clipping path")
    }

    let pathData: PTPathData = path.getPathData()
    let points: NSMutableArray = pathData.getPoints()
    let operators: Data = pathData.getOperators()

    // Position of the next unread value in `points`. Each operator consumes a fixed
    // number of values (2 for move/line, 6 for cubic, 4 for rect, 0 for close).
    var cursor = 0

    // Returns the value at `cursor` and advances past it.
    func takeValue() -> Double {
        let value = points[cursor] as! Double
        cursor += 1
        return value
    }

    // Use path.GetCTM() if you are interested in CTM (current transformation matrix).
    var summary = " Path Data Points := \""
    for op in operators {
        switch PTPathSegmentType(rawValue: UInt32(op)) {
        case e_ptmoveto:
            let x = takeValue()
            let y = takeValue()
            summary += String(format: "M%.5g %.5g", x, y)
        case e_ptlineto:
            let x = takeValue()
            let y = takeValue()
            summary += String(format: " L%.5g %.5g", x, y)
        case e_ptcubicto:
            let c1x = takeValue()
            let c1y = takeValue()
            let c2x = takeValue()
            let c2y = takeValue()
            let endX = takeValue()
            let endY = takeValue()
            summary += String(format: " C%.5g %.5g %.5g %.5g %.5g %.5g", c1x, c1y, c2x, c2y, endX, endY)
        case e_ptrect:
            // A rectangle is stored as origin + size; expand it to its four corners.
            let left = takeValue()
            let bottom = takeValue()
            let rectWidth = takeValue()
            let rectHeight = takeValue()
            let right = left + rectWidth
            let top = bottom + rectHeight
            summary += String(format: "M%.5g %.5g L%.5g %.5g L%.5g %.5g L%.5g %.5g Z",
                              left, bottom, right, bottom, right, top, left, top)
        case e_ptclosepath:
            summary += " Close Path"
        default:
            // Any other segment type is unexpected here.
            assert(false)
        }
    }
    summary += "\" "

    let gs: PTGState = path.getGState()

    // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    if path.isStroked() {
        summary += "Stroke path"
        if gs.getStrokeColorSpace().getType() == e_ptpattern {
            summary += "Path has associated pattern"
        }
        // Otherwise the stroke color is a plain color; PDFNet's color conversion
        // facilities can turn it into RGB, e.g.:
        // let rgb: PTColorPt = gs.getStrokeColorSpace().convert2RGB(gs.getStrokeColor())
    }

    if path.isFilled() {
        summary += "Fill path"
        if gs.getFillColorSpace().getType() == e_ptpattern {
            summary += "Path has associated pattern"
        }
        // Otherwise:
        // let rgb: PTColorPt = gs.getFillColorSpace().convert2RGB(gs.getFillColor())
    }

    // Process any changes in graphics state ---------------------------------
    let changes: PTGSChangesIterator = reader.getChangesIterator()
    while changes.hasNext() {
        switch PTGStateAttribute(rawValue: UInt32(changes.current())) {
        case e_ptfill_color:
            if gs.getFillColorSpace().getType() == e_ptpattern && gs.getFillPattern().getType() != e_ptshading {
                // Process the pattern data
                reader.patternBegin(true, reset_ctm_tfm: false)
                ProcessElements(reader: reader)
                reader.end()
            }
        case e_pttransform, e_ptline_width, e_ptline_cap, e_ptline_join,
             e_ptflatness, e_ptmiter_limit, e_ptdash_pattern:
            // Nothing is done for these attributes in this sample. The matching
            // values can be read from the graphics state if needed:
            //   e_pttransform    -> gs.getTransform()  (only the matrix installed for
            //                       this element, unlike path.GetCTM() which returns
            //                       the full transformation matrix)
            //   e_ptline_width   -> gs.getLineWidth()
            //   e_ptline_cap     -> gs.getLineCap()
            //   e_ptline_join    -> gs.getLineJoin()
            //   e_ptmiter_limit  -> gs.getMiterLimit()
            //   e_ptdash_pattern -> gs.getDashes(), gs.getPhase()
            break
        default:
            break
        }
        changes.next()
    }
    reader.clearChangeList()

    print(summary)
}
/// Reads the elements of one text block from `page_reader` and prints them.
///
/// Called right after an `e_pttext_begin` element has been read. Consumes elements
/// until the matching `e_pttext_end`, printing the font name of every text object
/// and, for non-Type3 fonts, the characters whose code lies in 32...255. Glyphs of
/// Type3 fonts are processed recursively through `ProcessElements`.
///
/// - Parameter page_reader: The reader positioned inside the text block.
func ProcessText(page_reader: PTElementReader) {
    // Begin text element
    print("Begin Text Block:")
    while let element = page_reader.next() {
        switch element.getType() {
        case e_pttext_end:
            // Finish the text block
            print("End Text Block.")
            return
        case e_pttext_obj:
            let gs: PTGState = element.getGState()
            let cs_fill: PTColorSpace = gs.getFillColorSpace()
            let fill: PTColorPt = gs.getFillColor()
            let _: PTColorPt = cs_fill.convert2RGB(fill) // outColor
            let _: PTColorSpace = gs.getStrokeColorSpace() // cs_stroke
            let _: PTColorPt = gs.getStrokeColor() // stroke
            let font: PTFont = gs.getFont()
            // Do not force-unwrap: a missing font name should not crash the sample.
            let fontName: String = font.getName() ?? ""
            print("Font Name: \(fontName)")
            // font.IsFixedWidth();
            // font.IsSerif();
            // font.IsSymbolic();
            // font.IsItalic();
            // ...
            // double font_size = gs.GetFontSize();
            // double word_spacing = gs.GetWordSpacing();
            // double char_spacing = gs.GetCharSpacing();
            // const UString* txt = element.GetTextString();
            if font.getType() == e_ptType3 {
                // Type 3 font: each glyph is itself a content stream, process its data
                let itr: PTCharIterator = element.getCharIterator()
                while itr.hasNext() {
                    page_reader.type3FontBegin(itr.current(), resource_dict: nil)
                    ProcessElements(reader: page_reader)
                    page_reader.end()
                    itr.next()
                }
            }
            else {
                var str = ""
                let itr: PTCharIterator = element.getCharIterator()
                while itr.hasNext() {
                    let char_code: UInt32 = itr.current().getChar_code()
                    // Print only codes in the single-byte printable range. Both bounds
                    // must hold; the previous `||` made this test always true.
                    if char_code >= 32 && char_code <= 255 {
                        if let scalar = UnicodeScalar(char_code) {
                            str += "\(Character(scalar))"
                        }
                    }
                    let x: Double = itr.current().getX() // character positioning information
                    let y: Double = itr.current().getY()
                    // Use element.getCTM() if you are interested in the CTM
                    // (current transformation matrix).
                    let ctm: PTMatrix2D = element.getCTM()
                    // To get the exact character positioning information you need to
                    // concatenate current text matrix with CTM and then multiply
                    // relative positioning coordinates with the resulting matrix.
                    //
                    // PTMatrix2D is a reference type and concat() mutates its receiver,
                    // so a text matrix shared across iterations would accumulate the
                    // CTM once per character. Request the text matrix again for every
                    // character instead of reusing one object.
                    // NOTE(review): assumes getTextMatrix() hands back a fresh object
                    // on each call — confirm against the SDK.
                    let mtx: PTMatrix2D = element.getTextMatrix()
                    mtx.concat(ctm.getM_a(), b: ctm.getM_b(), c: ctm.getM_c(), d: ctm.getM_d(), h: ctm.getM_h(), v: ctm.getM_v())
                    mtx.mult(PTPDFPoint(px: x, py: y))
                    // Get glyph path...
                    //vector<UChar> oprs;
                    //vector<double> glyph_data;
                    //font.GetGlyphPath(char_code, oprs, glyph_data, false, 0);
                    itr.next()
                }
                print("\(str)")
            }
        default:
            break
        }
    }
}
/// Prints the dimensions of an image element and reads its pixels as 8-bpc RGB.
///
/// The decoded data is read and discarded; the function only demonstrates how the
/// image can be pulled through the `PTImage2RGB` filter.
///
/// - Parameter image: The image element to inspect.
func ProcessImage(image: PTElement) {
    let _: Bool = image.isImageMask() // image_mask
    let _: Bool = image.isImageInterpolate() // interpolate
    let width = image.getImageWidth()
    let height = image.getImageHeight()
    // Three bytes (R, G, B) per pixel. Widen to Int before multiplying so that the
    // product for a large image cannot overflow (and trap) in a narrower integer
    // type returned by the SDK.
    let out_data_sz = Int(width) * Int(height) * 3
    print("Image: width=\"\(width)\" height=\"\(height)\"")
    //let mtx: PTMatrix2D = image.getCTM() // image matrix (page positioning info)
    // You can use GetImageData to read the raw (decoded) image data
    //image.getBitsPerComponent()
    //image.getImageData() // get raw image data
    // .... or use Image2RGB filter that converts every image to RGB format,
    // This should save you time since you don't need to deal with color conversions,
    // image up-sampling, decoding etc.
    let img_conv = PTImage2RGB(image_element: image) // Extract and convert image to RGB 8-bpc format
    let reader: PTFilterReader = PTFilterReader(filter: img_conv)
    // A buffer used to keep image data.
    let _: Data = reader.read(UInt(out_data_sz)) // image_data_out
    // image_data_out contains RGB image data.
    // Note that you don't need to read a whole image at a time. Alternatively
    // you can read a chunk at a time by repeatedly calling reader.read(buf_sz)
    // until no more data is returned.
}
/// Pulls elements from `reader` until it is exhausted and dispatches each one by type.
///
/// Paths, text blocks and images go to `ProcessPath`, `ProcessText` and
/// `ProcessImage`; form XObjects are entered and processed recursively. All other
/// element types are skipped.
///
/// - Parameter reader: The element reader to drain.
func ProcessElements(reader: PTElementReader) {
    while let element = reader.next() {
        let kind = element.getType()
        if kind == e_ptpath {
            // Process path data...
            ProcessPath(reader: reader, path: element)
        } else if kind == e_pttext_begin {
            // Process text block...
            ProcessText(page_reader: reader)
        } else if kind == e_ptform {
            // Process form XObjects: descend into the form's own content stream
            reader.formBegin()
            ProcessElements(reader: reader)
            reader.end()
        } else if kind == e_ptimage {
            // Process Images
            ProcessImage(image: element)
        }
    }
}
/// Runs the sample: opens the bundled "newsletter.pdf" and prints element
/// information for every page.
///
/// - Returns: 0 on success, 1 if an exception was caught while processing.
func runElementReaderAdvTest() -> Int {
    return autoreleasepool {
        var exitCode: Int = 0
        do {
            try PTPDFNet.catchException {
                // Extract text data from all pages in the document
                print("__________________________________________________")
                print("Extract page element information from all ")
                print("pages in the document.")

                let inputPath = Bundle.main.path(forResource: "newsletter", ofType: "pdf")
                let doc: PTPDFDoc = PTPDFDoc(filepath: inputPath)
                doc.initSecurityHandler()

                // Start at page 1 and walk to the end of the document.
                let pages: PTPageIterator = doc.getPageIterator(1)
                let elementReader: PTElementReader = PTElementReader()
                while pages.hasNext() {
                    print("Page \(pages.current().getIndex())----------------------------------------")
                    elementReader.begin(pages.current())
                    ProcessElements(reader: elementReader)
                    elementReader.end()
                    pages.next()
                }

                print("Done.")
            }
        } catch let e as NSError {
            print("\(e)")
            exitCode = 1
        }
        return exitCode
    }
}