TextExtract

Sample Java and Kotlin code for using the Apryse SDK to read a PDF (parse and extract text). If you'd like to search for text on PDF pages, see our code sample for text search. Learn more about our Android SDK and PDF Data Extraction SDK Capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package com.pdftron.android.pdfnetsdksamples.samples;
7
8import com.pdftron.android.pdfnetsdksamples.OutputListener;
9import com.pdftron.android.pdfnetsdksamples.PDFNetSample;
10import com.pdftron.android.pdfnetsdksamples.R;
11import com.pdftron.android.pdfnetsdksamples.util.Utils;
12import com.pdftron.common.PDFNetException;
13import com.pdftron.pdf.Element;
14import com.pdftron.pdf.ElementReader;
15import com.pdftron.pdf.PDFDoc;
16import com.pdftron.pdf.Page;
17import com.pdftron.pdf.PageIterator;
18import com.pdftron.pdf.Rect;
19import com.pdftron.pdf.TextExtractor;
20
21import java.text.DecimalFormat;
22import java.util.ArrayList;
23
24public class TextExtractTest extends PDFNetSample {
25
26	private static OutputListener mOutputListener;
27
28	private static ArrayList<String> mFileList = new ArrayList<>();
29
30    public TextExtractTest() {
31        setTitle(R.string.sample_textextract_title);
32        setDescription(R.string.sample_textextract_description);
33    }
34
35	@Override
36	public void run(OutputListener outputListener) {
37		super.run(outputListener);
38		mOutputListener = outputListener;
39		mFileList.clear();
40		printHeader(outputListener);
41
42        // string output_path = "../../TestFiles/Output/";
43        boolean example1_basic = false;
44        boolean example2_xml = false;
45        boolean example3_wordlist = false;
46        boolean example4_advanced = true;
47        boolean example5_low_level = false;
48
49        // Sample code showing how to use high-level text extraction APIs.
50        try (PDFDoc doc = new PDFDoc(Utils.getAssetTempFile(INPUT_PATH + "newsletter.pdf").getAbsolutePath())) {
51            doc.initSecurityHandler();
52
53            Page page = doc.getPage(1);
54            if (page == null) {
55                mOutputListener.println("Page not found.");
56            }
57
58            TextExtractor txt = new TextExtractor();
59            txt.begin(page);  // Read the page.
60            // Other options you may want to consider...
61            // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_no_dup_remove);
62            // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_remove_hidden_text);
63            // ...
64
65            // Example 1. Get all text on the page in a single string.
66            // Words will be separated with space or new line characters.
67            if (example1_basic) {
68                // Get the word count.
69                mOutputListener.println("Word Count: " + txt.getWordCount());
70
71                mOutputListener.println("\n\n- GetAsText --------------------------\n" + txt.getAsText());
72                mOutputListener.println("-----------------------------------------------------------");
73            }
74
75            // Example 2. Get XML logical structure for the page.
76            if (example2_xml) {
77                String text = txt.getAsXML(TextExtractor.e_words_as_elements | TextExtractor.e_output_bbox | TextExtractor.e_output_style_info);
78                mOutputListener.println("\n\n- GetAsXML  --------------------------\n" + text);
79                mOutputListener.println("-----------------------------------------------------------");
80            }
81
82            // Example 3. Extract words one by one.
83            if (example3_wordlist) {
84                TextExtractor.Word word;
85                for (TextExtractor.Line line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
86                    for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
87                        mOutputListener.println(word.getString());
88                    }
89                }
90                mOutputListener.println("-----------------------------------------------------------");
91            }
92
93            // Example 4. A more advanced text extraction example.
94            // The output is XML structure containing paragraphs, lines, words,
95            // as well as style and positioning information.
96            if (example4_advanced) {
97                Rect bbox;
98                int cur_flow_id = -1, cur_para_id = -1;
99
100                TextExtractor.Line line;
101                TextExtractor.Word word;
102                TextExtractor.Style s, line_style;
103
104                mOutputListener.println("<PDFText>");
105                // For each line on the page...
106                for (line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
107                    if (line.getNumWords() == 0)
108                        continue;
109                    if (cur_flow_id != line.getFlowID()) {
110                        if (cur_flow_id != -1) {
111                            if (cur_para_id != -1) {
112                                cur_para_id = -1;
113                                mOutputListener.println("</Para>");
114                            }
115                            mOutputListener.println("</Flow>");
116                        }
117                        cur_flow_id = line.getFlowID();
118                        mOutputListener.println("<Flow id=\"" + cur_flow_id + "\">");
119                    }
120
121                    if (cur_para_id != line.getParagraphID()) {
122                        if (cur_para_id != -1)
123                            mOutputListener.println("</Para>");
124                        cur_para_id = line.getParagraphID();
125                        mOutputListener.println("<Para id=\"" + cur_para_id + "\">");
126                    }
127
128                    bbox = line.getBBox();
129                    line_style = line.getStyle();
130                    mOutputListener.print("<Line box=\"" +  String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
131                    printStyle(line_style);
132                    mOutputListener.println(" cur_num=\"" + line.getCurrentNum() + "\">");
133                 
134
135                    // For each word in the line...
136                    for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
137                        // Output the bounding box for the word.
138                        bbox = word.getBBox();
139                        mOutputListener.print("<Word box=\"" +  String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
140                        mOutputListener.print(" cur_num=\"" + word.getCurrentNum() + "\"");
141                        int sz = word.getStringLen();
142                        if (sz == 0) continue;
143
144                        // If the word style is different from the parent style, output the new style.
145                        s = word.getStyle();
146                        if (!s.equals(line_style)) {
147                            printStyle(s);
148                        }
149
150                        mOutputListener.print(">" + word.getString());
151                        mOutputListener.println("</Word>");
152                    }
153                    mOutputListener.println("</Line>");
154                }
155
156                if (cur_flow_id != -1) {
157                    if (cur_para_id != -1) {
158                        cur_para_id = -1;
159                        mOutputListener.println("</Para>");
160                    }
161                    mOutputListener.println("</Flow>");
162                }
163            }
164            txt.destroy();
165            mOutputListener.println("</PDFText>");
166        } catch (PDFNetException e) {
167            mOutputListener.printError(e.getStackTrace());
168        }
169
170        // Sample code showing how to use low-level text extraction APIs.
171        if (example5_low_level) {
172            try (PDFDoc doc = new PDFDoc((Utils.getAssetTempFile(INPUT_PATH + "newsletter.pdf").getAbsolutePath()))) {
173                doc.initSecurityHandler();
174
175                // Example 1. Extract all text content from the document
176
177                ElementReader reader = new ElementReader();
178                //  Read every page
179                for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
180                    reader.begin(itr.next());
181                    DumpAllText(reader);
182                    reader.end();
183                }
184
185                // Example 2. Extract text content based on the
186                // selection rectangle.
187                mOutputListener.print("\n----------------------------------------------------");
188                mOutputListener.print("\nExtract text based on the selection rectangle.");
189                mOutputListener.println("\n----------------------------------------------------");
190
191                Page first_page = doc.getPageIterator().next();
192                String s1 = ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
193                mOutputListener.print("\nField 1: " + s1);
194
195                s1 = ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
196                mOutputListener.print("\nField 2: " + s1);
197
198                s1 = ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);
199                mOutputListener.print("\nField 3: " + s1);
200
201                // ...
202                mOutputListener.println("Done.");
203            } catch (Exception e) {
204                mOutputListener.printError(e.getStackTrace());
205            }
206        }
207
208		for (String file : mFileList) {
209			addToFileList(file);
210		}
211		printFooter(outputListener);
212	}
213
214
215    static void printStyle(TextExtractor.Style s) {
216        byte r = s.getColor()[0];
217		byte g = s.getColor()[1];
218		byte b = s.getColor()[2];
219        String rgb_hex =  String.format("%02X%02X%02X;", r, g, b );
220        DecimalFormat df = new DecimalFormat("#.#");
221        mOutputListener.print(" style=\"font-family:" + s.getFontName() + "; "
222                + "font-size:" + df.format(s.getFontSize()) + ";"
223                + (s.isSerif() ? " sans-serif; " : " ")
224                + "color:#" + rgb_hex + "\"");
225    }
226
227    // A utility method used to dump all text content in the console window.
228    static void DumpAllText(ElementReader reader) throws PDFNetException {
229        Element element;
230        while ((element = reader.next()) != null) {
231            switch (element.getType()) {
232                case Element.e_text_begin:
233                    mOutputListener.println("\n--> Text Block Begin");
234                    break;
235                case Element.e_text_end:
236                    mOutputListener.println("\n--> Text Block End");
237                    break;
238                case Element.e_text: {
239                    Rect bbox = element.getBBox();
240                    if (bbox == null) continue;
241                    mOutputListener.println("\n--> BBox: " + bbox.getX1() + ", "
242                            + bbox.getY1() + ", "
243                            + bbox.getX2() + ", "
244                            + bbox.getY2());
245
246                    String arr = element.getTextString();
247                    mOutputListener.println(arr);
248                }
249                break;
250                case Element.e_text_new_line:
251                    mOutputListener.println("\n--> New Line");
252                    break;
253                case Element.e_form:                // Process form XObjects
254                    reader.formBegin();
255                    DumpAllText(reader);
256                    reader.end();
257                    break;
258            }
259        }
260    }
261
262    // A helper method for ReadTextFromRect
263    static String RectTextSearch(ElementReader reader, Rect pos) throws PDFNetException {
264        Element element;
265        String srch_str = new String();
266        while ((element = reader.next()) != null) {
267            switch (element.getType()) {
268                case Element.e_text: {
269                    Rect bbox = element.getBBox();
270                    if (bbox == null) continue;
271                    if (bbox.intersectRect(bbox, pos)) {
272                        String arr = element.getTextString();
273                        srch_str += arr;
274                        srch_str += "\n"; // add a new line?
275                    }
276                    break;
277                }
278                case Element.e_text_new_line: {
279                    break;
280                }
281                case Element.e_form: // Process form XObjects
282                {
283                    reader.formBegin();
284                    srch_str += RectTextSearch(reader, pos);
285                    reader.end();
286                    break;
287                }
288            }
289        }
290        return srch_str;
291    }
292
293    // A utility method used to extract all text content from
294    // a given selection rectangle. The rectangle coordinates are
295    // expressed in PDF user/page coordinate system.
296    static String ReadTextFromRect(Page page, Rect pos, ElementReader reader) throws PDFNetException {
297        reader.begin(page);
298        String srch_str = RectTextSearch(reader, pos);
299        reader.end();
300        return srch_str;
301    }
302
303}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package com.pdftron.android.pdfnetsdksamples.samples
7
8import com.pdftron.android.pdfnetsdksamples.OutputListener
9import com.pdftron.android.pdfnetsdksamples.PDFNetSample
10import com.pdftron.android.pdfnetsdksamples.R
11import com.pdftron.android.pdfnetsdksamples.util.Utils
12import com.pdftron.common.PDFNetException
13import com.pdftron.pdf.*
14import java.text.DecimalFormat
15import java.util.*
16
/**
 * Sample that extracts text from the first page of "newsletter.pdf".
 *
 * The high-level examples (1-4) use [TextExtractor]; the low-level example (5) walks the
 * page content with [ElementReader]. Which examples run is controlled by the boolean
 * flags at the top of [run]. All output is sent to the [OutputListener] passed to [run].
 */
class TextExtractTest : PDFNetSample() {
    init {
        setTitle(R.string.sample_textextract_title)
        setDescription(R.string.sample_textextract_description)
    }

    /**
     * Runs the enabled text-extraction examples and prints their output.
     *
     * @param outputListener receiver of all text and error output produced by the sample
     */
    override fun run(outputListener: OutputListener?) {
        super.run(outputListener)
        mOutputListener = outputListener
        mFileList.clear()
        printHeader(outputListener!!)

        // Toggle these flags to choose which examples are executed.
        val example1_basic = false
        val example2_xml = false
        val example3_wordlist = false
        val example4_advanced = true
        val example5_low_level = false

        // Sample code showing how to use high-level text extraction APIs.
        try {
            PDFDoc(Utils.getAssetTempFile(PDFNetSample.INPUT_PATH + "newsletter.pdf")!!.absolutePath).use { doc ->
                doc.initSecurityHandler()

                val page = doc.getPage(1)
                if (page == null) {
                    // Nothing to extract from; skip the high-level examples instead of
                    // forcing a null page into the extractor.
                    mOutputListener!!.println("Page not found.")
                } else {
                    val txt = TextExtractor()
                    try {
                        txt.begin(page)  // Read the page.
                        // Other options you may want to consider...
                        // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_no_dup_remove);
                        // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                        // ...

                        if (example1_basic) {
                            printPlainText(txt)
                        }
                        if (example2_xml) {
                            printXml(txt)
                        }
                        if (example3_wordlist) {
                            printWordList(txt)
                        }
                        if (example4_advanced) {
                            printStructuredText(txt)
                        }
                    } finally {
                        // Release the extractor even when one of the examples throws.
                        txt.destroy()
                    }
                }
            }
        } catch (e: PDFNetException) {
            mOutputListener!!.printError(e.stackTrace)
        }

        // Sample code showing how to use low-level text extraction APIs.
        if (example5_low_level) {
            try {
                PDFDoc(Utils.getAssetTempFile(PDFNetSample.INPUT_PATH + "newsletter.pdf")!!.absolutePath).use { doc ->
                    doc.initSecurityHandler()
                    runLowLevelExamples(doc)
                }
            } catch (e: Exception) {
                mOutputListener!!.printError(e.stackTrace)
            }
        }

        for (file in mFileList) {
            addToFileList(file)
        }
        printFooter(outputListener)
    }

    companion object {

        // Listener of the most recent run(); kept here so the companion helpers can print.
        private var mOutputListener: OutputListener? = null

        private val mFileList = ArrayList<String>()

        // Example 1. Prints the word count and all text on the page as a single string.
        // Words will be separated with space or new line characters.
        @Throws(PDFNetException::class)
        private fun printPlainText(txt: TextExtractor) {
            val out = mOutputListener!!
            out.println("Word Count: " + txt.wordCount)

            out.println("\n\n- GetAsText --------------------------\n" + txt.asText)
            out.println("-----------------------------------------------------------")
        }

        // Example 2. Prints the XML logical structure for the page.
        @Throws(PDFNetException::class)
        private fun printXml(txt: TextExtractor) {
            val out = mOutputListener!!
            val text = txt.getAsXML(TextExtractor.e_words_as_elements or TextExtractor.e_output_bbox or TextExtractor.e_output_style_info)
            out.println("\n\n- GetAsXML  --------------------------\n$text")
            out.println("-----------------------------------------------------------")
        }

        // Example 3. Prints the words of the page one per output line.
        @Throws(PDFNetException::class)
        private fun printWordList(txt: TextExtractor) {
            val out = mOutputListener!!
            var line: TextExtractor.Line = txt.firstLine
            while (line.isValid) {
                var word: TextExtractor.Word = line.firstWord
                while (word.isValid) {
                    out.println(word.string)
                    word = word.nextWord
                }
                line = line.nextLine
            }
            out.println("-----------------------------------------------------------")
        }

        // Example 4. Prints an XML-like structure containing flows, paragraphs, lines and
        // words, together with style and positioning information.
        @Throws(PDFNetException::class)
        private fun printStructuredText(txt: TextExtractor) {
            val out = mOutputListener!!
            var curFlowId = -1
            var curParaId = -1

            out.println("<PDFText>")
            // For each line on the page...
            var line: TextExtractor.Line = txt.firstLine
            while (line.isValid) {
                if (line.numWords != 0) {
                    if (curFlowId != line.flowID) {
                        // Entering a new flow: close the open paragraph and flow first.
                        if (curFlowId != -1) {
                            if (curParaId != -1) {
                                curParaId = -1
                                out.println("</Para>")
                            }
                            out.println("</Flow>")
                        }
                        curFlowId = line.flowID
                        out.println("<Flow id=\"$curFlowId\">")
                    }

                    if (curParaId != line.paragraphID) {
                        if (curParaId != -1) {
                            out.println("</Para>")
                        }
                        curParaId = line.paragraphID
                        out.println("<Para id=\"$curParaId\">")
                    }

                    val lineBox = line.bBox
                    val lineStyle = line.style
                    out.print("<Line box=\"" + formatBox(lineBox) + "\"")
                    printStyle(lineStyle)
                    out.println(" cur_num=\"" + line.currentNum + "\">")

                    // For each word in the line...
                    var word: TextExtractor.Word = line.firstWord
                    while (word.isValid) {
                        // Skip empty words before emitting anything, so that no
                        // unterminated "<Word" fragment ends up in the output.
                        if (word.stringLen != 0) {
                            // Output the bounding box for the word.
                            out.print("<Word box=\"" + formatBox(word.bBox) + "\"")
                            out.print(" cur_num=\"" + word.currentNum + "\"")

                            // If the word style is different from the parent style, output the new style.
                            val wordStyle = word.style
                            if (wordStyle != lineStyle) {
                                printStyle(wordStyle)
                            }

                            out.print(">" + word.string)
                            out.println("</Word>")
                        }
                        word = word.nextWord
                    }
                    out.println("</Line>")
                }
                line = line.nextLine
            }

            if (curFlowId != -1) {
                if (curParaId != -1) {
                    out.println("</Para>")
                }
                out.println("</Flow>")
            }
            // Closing tag lives next to its opening tag so it is only printed when
            // "<PDFText>" was printed.
            out.println("</PDFText>")
        }

        // Formats a rectangle as "x1, y1, x2, y2" with two decimals per coordinate.
        @Throws(PDFNetException::class)
        private fun formatBox(bbox: Rect): String {
            return String.format("%.2f, %.2f, %.2f, %.2f", bbox.x1, bbox.y1, bbox.x2, bbox.y2)
        }

        // Example 5. Dumps all text of every page with the low-level reader, then prints
        // the text found inside three fixed rectangles on the first page.
        @Throws(PDFNetException::class)
        private fun runLowLevelExamples(doc: PDFDoc) {
            val out = mOutputListener!!

            // Example 1. Extract all text content from the document
            val reader = ElementReader()
            //  Read every page
            val itr = doc.pageIterator
            while (itr.hasNext()) {
                reader.begin(itr.next())
                DumpAllText(reader)
                reader.end()
            }

            // Example 2. Extract text content based on the
            // selection rectangle.
            out.print("\n----------------------------------------------------")
            out.print("\nExtract text based on the selection rectangle.")
            out.println("\n----------------------------------------------------")

            val first_page = doc.pageIterator.next()!!
            var s1 = ReadTextFromRect(first_page, Rect(27.0, 392.0, 563.0, 534.0), reader)
            out.print("\nField 1: $s1")

            s1 = ReadTextFromRect(first_page, Rect(28.0, 551.0, 106.0, 623.0), reader)
            out.print("\nField 2: $s1")

            s1 = ReadTextFromRect(first_page, Rect(208.0, 550.0, 387.0, 621.0), reader)
            out.print("\nField 3: $s1")

            // ...
            out.println("Done.")
        }

        /**
         * Prints a `style="..."` attribute describing the given text style: font name,
         * font size (at most one decimal), an optional generic family and the color as hex.
         *
         * @param s the style to describe
         */
        internal fun printStyle(s: TextExtractor.Style) {
            val rgb = s.color
            val rgb_hex = String.format("%02X%02X%02X;", rgb[0], rgb[1], rgb[2])
            val df = DecimalFormat("#.#")
            mOutputListener!!.print(" style=\"font-family:" + s.fontName + "; "
                    + "font-size:" + df.format(s.fontSize) + ";"
                    // A serif font is labelled "serif" (it was previously labelled "sans-serif").
                    + (if (s.isSerif) " serif; " else " ")
                    + "color:#" + rgb_hex + "\"")
        }

        /**
         * Prints every text-related element the reader yields: text block begin/end
         * markers, the bounding box and string of each text run, and new-line markers.
         * Form XObjects are entered and dumped recursively.
         *
         * @param reader a reader on which `begin` (or `formBegin`) was already called
         */
        @Throws(PDFNetException::class)
        internal fun DumpAllText(reader: ElementReader) {
            val out = mOutputListener!!
            while (true) {
                val element = reader.next() ?: break
                when (element.type) {
                    Element.e_text_begin -> out.println("\n--> Text Block Begin")
                    Element.e_text_end -> out.println("\n--> Text Block End")
                    Element.e_text -> {
                        val bbox = element.bBox
                        if (bbox != null) {
                            out.println("\n--> BBox: " + bbox.x1 + ", "
                                    + bbox.y1 + ", "
                                    + bbox.x2 + ", "
                                    + bbox.y2)

                            out.println(element.textString)
                        }
                    }
                    Element.e_text_new_line -> out.println("\n--> New Line")
                    Element.e_form -> {
                        // Process form XObjects
                        reader.formBegin()
                        DumpAllText(reader)
                        reader.end()
                    }
                }
            }
        }

        /**
         * Helper for [ReadTextFromRect]: collects the strings of all text elements for
         * which `bbox.intersectRect(bbox, pos)` reports an intersection, each followed by
         * a new line. Form XObjects are searched recursively.
         *
         * @param reader a reader on which `begin` (or `formBegin`) was already called
         * @param pos the selection rectangle
         * @return the concatenated text, or an empty string when nothing matched
         */
        @Throws(PDFNetException::class)
        internal fun RectTextSearch(reader: ElementReader, pos: Rect): String {
            val found = StringBuilder()
            while (true) {
                val element = reader.next() ?: break
                when (element.type) {
                    Element.e_text -> {
                        val bbox = element.bBox
                        // NOTE(review): intersectRect presumably also stores the
                        // intersection in bbox; only its boolean result is used here.
                        if (bbox != null && bbox.intersectRect(bbox, pos)) {
                            found.append(element.textString).append("\n")
                        }
                    }
                    Element.e_form -> {
                        // Process form XObjects
                        reader.formBegin()
                        found.append(RectTextSearch(reader, pos))
                        reader.end()
                    }
                }
            }
            return found.toString()
        }

        /**
         * Extracts all text content from a given selection rectangle on a page. The
         * rectangle coordinates are expressed in PDF user/page coordinate system.
         *
         * @param page the page to read
         * @param pos the selection rectangle
         * @param reader the reader used to walk the page content
         * @return the text found inside the rectangle, one text run per line
         */
        @Throws(PDFNetException::class)
        internal fun ReadTextFromRect(page: Page, pos: Rect, reader: ElementReader): String {
            reader.begin(page)
            val srch_str = RectTextSearch(reader, pos)
            reader.end()
            return srch_str
        }
    }

}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales

Product:

TextExtract