Some test text!

Search
Hamburger Icon

Read a PDF file in Swift (parse & extract text)

More languages

More languages
JavaScript
Java (Android)
C++
C#
C# (.NET Core)
Go
Java
Kotlin
Obj-C
JS (Node.js)
PHP
Python
Ruby
Swift
C# (UWP)
VB
C# (Xamarin)

Sample Swift code for using the Apryse SDK to read a PDF (parse and extract text). If you'd like to search for text on PDF pages, see our code sample for text search. Learn more about our Swift PDF Library and PDF Parsing & Content Extraction Library.

Get Started Samples Download

To run this sample, get started with a free trial of Apryse SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------

import PDFNet
import Foundation

// This sample illustrates the basic text extraction capabilities of PDFNet.

/// Dumps all text content delivered by `reader` to the console.
///
/// Consumes elements until `reader.next()` returns `nil`. Text-block boundaries and
/// new-line elements are printed as markers; each text object is printed as its bounding
/// box followed by its text. Form XObjects are entered with `formBegin()`, dumped
/// recursively with the same reader, and closed with `end()`.
///
/// - Parameter reader: The element reader to drain. The caller is expected to have
///   started it on some content before calling this function.
func DumpAllText(_ reader: PTElementReader) {
    while let element = reader.next() {
        switch element.getType() {
        case e_pttext_begin:
            print("--> Text Block Begin")
        case e_pttext_end:
            print("--> Text Block End")
        case e_pttext_obj:
            let bbox: PTPDFRect = element.getBBox()
            print("--> BBox: \(bbox.getX1()), \(bbox.getY1()), \(bbox.getX2()), \(bbox.getY2())")
            // Print an empty line instead of force-unwrapping: a text object without a
            // string must not crash the dump (RectTextSearch uses the same fallback).
            print(element.getTextString() ?? "")
        case e_pttext_new_line:
            print("--> New Line")
        case e_ptform:
            // Process form XObjects: descend, dump the nested content, then pop back out.
            reader.formBegin()
            DumpAllText(reader)
            reader.end()
        default:
            // Non-text elements are not relevant to a text dump.
            break
        }
    }
}

/// Recursive worker for `ReadTextFromRect`.
///
/// Drains `reader` and, for every text object whose bounding box intersects `pos`,
/// appends that object's text followed by a line break to `srch_str`. Form XObjects
/// are entered and searched recursively with the same reader.
///
/// - Parameters:
///   - reader: The element reader to drain.
///   - pos: The selection rectangle to test each text object's bounding box against.
///   - srch_str: Accumulator that receives the matching text.
func RectTextSearch(reader: PTElementReader, pos: PTPDFRect, srch_str: inout String) {
    while let item = reader.next() {
        let kind = item.getType()
        if kind == e_pttext_obj {
            let box: PTPDFRect = item.getBBox()
            // Only text whose box overlaps the selection rectangle is collected.
            guard box.intersect(box, rect2: pos) else { continue }
            srch_str += item.getTextString() ?? ""
            srch_str += "\n"    // one collected text object per line
        } else if kind == e_ptform {
            // Descend into the form XObject, search it, then return to the parent stream.
            reader.formBegin()
            RectTextSearch(reader: reader, pos: pos, srch_str: &srch_str)
            reader.end()
        }
        // Every other element type (including text new-line markers) is ignored.
    }
}

/// Extracts the text content that falls inside a selection rectangle on a page.
///
/// The rectangle coordinates are expressed in the PDF user/page coordinate system.
///
/// - Parameters:
///   - page: The page to read.
///   - pos: The selection rectangle.
///   - reader: The element reader used to walk the page; it is started on `page` and
///     ended again before this function returns.
/// - Returns: The text collected by `RectTextSearch`, one matching text object per line.
func ReadTextFromRect(page: PTPage, pos: PTPDFRect, reader: PTElementReader) -> String {
    var collected = String()
    reader.begin(page)
    // Pair the begin() above with end() on the way out.
    defer { reader.end() }
    RectTextSearch(reader: reader, pos: pos, srch_str: &collected)
    return collected
}

/// Prints a text style as a ` style="…"` attribute fragment.
///
/// The fragment contains the font name, the font size, the value of `isSerif()` and the
/// first three components of the array returned by `getColor()`.
///
/// - Parameter s: The style to print.
func PrintStyle(_ s: PTTextExtractorStyle) {
    let rgb: NSMutableArray = s.getColor()
    // Fall back to an empty name rather than force-unwrapping, so a style without a
    // font name cannot crash the dump.
    let fontName = s.getFontName() ?? ""
    // NOTE(review): the "sans-serif" label is paired with isSerif(); the label looks
    // inverted. Left unchanged to preserve the existing output — confirm the intent.
    print(" style=\"font-family:\(fontName); font-size:\(s.getFontSize()); sans-serif: \(s.isSerif()); color: #\(rgb[0]), \(rgb[1]), \(rgb[2])\"")
}

/// Runs the text-extraction sample against the bundled "newsletter.pdf".
///
/// Opens the document, reads page 1 with a `PTTextExtractor` and, depending on the
/// `example…` flags below, prints:
/// 1. the word count and the page text as a single string,
/// 2. the page as XML,
/// 3. every word on its own line,
/// 4. a hand-built Flow / Para / Line / Word structure with bounding boxes and styles.
///
/// The low-level `PTElementReader` example (5) is disabled and its code is commented out.
///
/// - Returns: `0` on success; `1` if page 1 cannot be obtained or if
///   `PTPDFNet.catchException` throws.
func runTextExtractTest() -> Int {
    return autoreleasepool {
        var ret = 0

        let example1_basic = true
        let example2_xml = true
        let example3_wordlist = true
        let example4_advanced = true
        let example5_low_level = false

        // Sample code showing how to use high-level text extraction APIs.
        do {
            try PTPDFNet.catchException {
                let doc: PTPDFDoc = PTPDFDoc(filepath: Bundle.main.path(forResource: "newsletter", ofType: "pdf"))
                doc.initSecurityHandler()

                guard let page: PTPage = doc.getPage(1) else {
                    print("Page not found.")
                    ret = 1
                    return
                }

                let txt: PTTextExtractor = PTTextExtractor()
                txt.begin(page, clip_ptr: nil, flags: 0)    // Read the page.
                // Other options you may want to consider...
                // txt.begin(page, nil, e_ptno_dup_remove);
                // txt.begin(page, nil, e_ptremove_hidden_text);

                // Example 1. Get all text on the page in a single string.
                // Words will be separated with space or new line characters.
                if example1_basic {
                    // Get the word count.
                    print("Word Count: \(txt.getWordCount())")

                    let text: String = txt.getAsText(true)
                    print("\n\n- GetAsText --------------------------\n\(text)")
                    print("-----------------------------------------------------------")
                }

                // Example 2. Get XML logical structure for the page.
                if example2_xml {
                    let text: String = txt.getAsXML(e_ptwords_as_elements.rawValue | e_ptoutput_bbox.rawValue | e_ptoutput_style_info.rawValue)
                    print("\n\n- GetAsXML  --------------------------\n\(text)")
                    print("-----------------------------------------------------------")
                }

                // Example 3. Extract words one by one.
                if example3_wordlist {
                    var line: PTTextExtractorLine = txt.getFirstLine()
                    while line.isValid() {
                        var word: PTWord = line.getFirstWord()
                        while word.isValid() {
                            // Print an empty line for a word without a string instead of crashing.
                            print(word.getString() ?? "")
                            word = word.getNext()
                        }
                        line = line.getNext()
                    }
                    print("-----------------------------------------------------------")
                }

                // Example 4. A more advanced text extraction example.
                // The output is XML structure containing paragraphs, lines, words,
                // as well as style and positioning information.
                if example4_advanced {
                    var cur_flow_id = -1
                    var cur_para_id = -1

                    // For each line on the page...
                    var line: PTTextExtractorLine = txt.getFirstLine()
                    while line.isValid() {
                        // Advance on every exit from this iteration, including `continue`;
                        // skipping an empty line without advancing would loop forever.
                        defer { line = line.getNext() }

                        if line.getNumWords() == 0 {
                            continue
                        }
                        if cur_flow_id != line.getFlowID() {
                            // Entering a new flow: close the open paragraph and flow first.
                            if cur_flow_id != -1 {
                                if cur_para_id != -1 {
                                    cur_para_id = -1
                                    print("</Para>")
                                }
                                print("</Flow>\n")
                            }
                            cur_flow_id = Int(line.getFlowID())
                            print("<Flow id=\"\(cur_flow_id)\">\n")
                        }
                        if cur_para_id != line.getParagraphID() {
                            if cur_para_id != -1 {
                                print("</Para>\n")
                            }
                            cur_para_id = Int(line.getParagraphID())
                            print("<Para id=\"\(cur_para_id)\">\n")
                        }

                        let b: PTPDFRect = line.getBBox()
                        let line_style: PTTextExtractorStyle = line.getStyle()
                        print("<Line box=\"\(b.getX1()), \(b.getY1()), \(b.getX2()), \(b.getY2())\"")
                        PrintStyle(line_style)
                        print(">\n")

                        // For each word in the line...
                        var word: PTWord = line.getFirstWord()
                        while word.isValid() {
                            // Same reasoning as for lines: always step to the next word.
                            defer { word = word.getNext() }

                            // Skip empty words before emitting anything, so no
                            // unterminated <Word …> tag is left in the output.
                            if word.getStringLen() == 0 {
                                continue
                            }

                            // Output the bounding box for the word.
                            let q: PTPDFRect = word.getBBox()
                            print("<Word box=\"\(q.getX1()), \(q.getY1()), \(q.getX2()), \(q.getY2())\"")

                            // If the word style is different from the parent style, output the new style.
                            // NOTE(review): `!=` here presumably resolves to object equality;
                            // confirm it compares style values rather than identity.
                            let s: PTTextExtractorStyle = word.getStyle()
                            if s != line_style {
                                PrintStyle(s)
                            }

                            print(">\(word.getString() ?? "")")
                            print("</Word>\n")
                        }
                        print("</Line>\n")
                    }
                    // Close whatever is still open after the last line.
                    if cur_flow_id != -1 {
                        if cur_para_id != -1 {
                            print("</Para>\n")
                        }
                        print("</Flow>\n")
                    }
                }
            }
        } catch let e as NSError {
            print("\(e)")
            ret = 1
        }

        // Example 5 (low-level element reader) is disabled; its code is kept for reference.
        if example5_low_level {
//            do {
//                try PTPDFNet.catchException {
//                    let doc: PTPDFDoc = PTPDFDoc(filepath: Bundle.main.path(forResource: "newsletter", ofType: "pdf"))
//                    doc.initSecurityHandler()
//
//                    // Example 1. Extract all text content from the document
//
//                    let reader: PTElementReader = PTElementReader()
//                    //  Read every page
//                    let itr: PTPageIterator = doc.getPageIterator(1)
//                    while itr.hasNext() {
//                        reader.begin(itr.current())
//                        DumpAllText(reader)
//                        reader.end()
//                        itr.next()
//                    }
//
//                    // Example 2. Extract text content based on the
//                    // selection rectangle.
//                    print("\n----------------------------------------------------")
//                    print("\nExtract text based on the selection rectangle.")
//                    print("\n----------------------------------------------------\n")
//
//                    let first_page: PTPage = doc.getPageIterator(1).current()
//                    let rect1: PTPDFRect = PTPDFRect(x1: 27, y1: 392, x2: 563, y2: 534)
//                    var s1: String = ReadTextFromRect(page: first_page, pos: rect1, reader: reader)
//                    print("\nField 1: \(s1)")
//
//                    let rect2: PTPDFRect = PTPDFRect(x1: 28, y1: 551, x2: 106, y2: 623)
//                    s1 = ReadTextFromRect(page: first_page, pos: rect2, reader: reader)
//                    print("\nField 2: \(s1)")
//
//                    let rect3: PTPDFRect = PTPDFRect(x1: 208, y1: 550, x2: 387, y2: 621)
//                    s1 = ReadTextFromRect(page: first_page, pos: rect3, reader: reader)
//                    print("\nField 3: \(s1)")
//
//                    // ...
//                    print("Done.")
//                }
//            } catch let e as NSError {
//                print("\(e)")
//                ret = 1
//            }
        }

        return ret
    }
}