Sample Java and Kotlin code for using the Apryse SDK to read a PDF (parse and extract text). If you'd like to search for text on PDF pages, see our code sample for text search. Learn more about our Android SDK and PDF Data Extraction SDK capabilities.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package com.pdftron.android.pdfnetsdksamples.samples;
7
8import com.pdftron.android.pdfnetsdksamples.OutputListener;
9import com.pdftron.android.pdfnetsdksamples.PDFNetSample;
10import com.pdftron.android.pdfnetsdksamples.R;
11import com.pdftron.android.pdfnetsdksamples.util.Utils;
12import com.pdftron.common.PDFNetException;
13import com.pdftron.pdf.Element;
14import com.pdftron.pdf.ElementReader;
15import com.pdftron.pdf.PDFDoc;
16import com.pdftron.pdf.Page;
17import com.pdftron.pdf.PageIterator;
18import com.pdftron.pdf.Rect;
19import com.pdftron.pdf.TextExtractor;
20
21import java.text.DecimalFormat;
22import java.util.ArrayList;
23
24public class TextExtractTest extends PDFNetSample {
25
26 private static OutputListener mOutputListener;
27
28 private static ArrayList<String> mFileList = new ArrayList<>();
29
30 public TextExtractTest() {
31 setTitle(R.string.sample_textextract_title);
32 setDescription(R.string.sample_textextract_description);
33 }
34
35 @Override
36 public void run(OutputListener outputListener) {
37 super.run(outputListener);
38 mOutputListener = outputListener;
39 mFileList.clear();
40 printHeader(outputListener);
41
42 // string output_path = "../../TestFiles/Output/";
43 boolean example1_basic = false;
44 boolean example2_xml = false;
45 boolean example3_wordlist = false;
46 boolean example4_advanced = true;
47 boolean example5_low_level = false;
48
49 // Sample code showing how to use high-level text extraction APIs.
50 try (PDFDoc doc = new PDFDoc(Utils.getAssetTempFile(INPUT_PATH + "newsletter.pdf").getAbsolutePath())) {
51 doc.initSecurityHandler();
52
53 Page page = doc.getPage(1);
54 if (page == null) {
55 mOutputListener.println("Page not found.");
56 }
57
58 TextExtractor txt = new TextExtractor();
59 txt.begin(page); // Read the page.
60 // Other options you may want to consider...
61 // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_no_dup_remove);
62 // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_remove_hidden_text);
63 // ...
64
65 // Example 1. Get all text on the page in a single string.
66 // Words will be separated with space or new line characters.
67 if (example1_basic) {
68 // Get the word count.
69 mOutputListener.println("Word Count: " + txt.getWordCount());
70
71 mOutputListener.println("\n\n- GetAsText --------------------------\n" + txt.getAsText());
72 mOutputListener.println("-----------------------------------------------------------");
73 }
74
75 // Example 2. Get XML logical structure for the page.
76 if (example2_xml) {
77 String text = txt.getAsXML(TextExtractor.e_words_as_elements | TextExtractor.e_output_bbox | TextExtractor.e_output_style_info);
78 mOutputListener.println("\n\n- GetAsXML --------------------------\n" + text);
79 mOutputListener.println("-----------------------------------------------------------");
80 }
81
82 // Example 3. Extract words one by one.
83 if (example3_wordlist) {
84 TextExtractor.Word word;
85 for (TextExtractor.Line line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
86 for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
87 mOutputListener.println(word.getString());
88 }
89 }
90 mOutputListener.println("-----------------------------------------------------------");
91 }
92
93 // Example 4. A more advanced text extraction example.
94 // The output is XML structure containing paragraphs, lines, words,
95 // as well as style and positioning information.
96 if (example4_advanced) {
97 Rect bbox;
98 int cur_flow_id = -1, cur_para_id = -1;
99
100 TextExtractor.Line line;
101 TextExtractor.Word word;
102 TextExtractor.Style s, line_style;
103
104 mOutputListener.println("<PDFText>");
105 // For each line on the page...
106 for (line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
107 if (line.getNumWords() == 0)
108 continue;
109 if (cur_flow_id != line.getFlowID()) {
110 if (cur_flow_id != -1) {
111 if (cur_para_id != -1) {
112 cur_para_id = -1;
113 mOutputListener.println("</Para>");
114 }
115 mOutputListener.println("</Flow>");
116 }
117 cur_flow_id = line.getFlowID();
118 mOutputListener.println("<Flow id=\"" + cur_flow_id + "\">");
119 }
120
121 if (cur_para_id != line.getParagraphID()) {
122 if (cur_para_id != -1)
123 mOutputListener.println("</Para>");
124 cur_para_id = line.getParagraphID();
125 mOutputListener.println("<Para id=\"" + cur_para_id + "\">");
126 }
127
128 bbox = line.getBBox();
129 line_style = line.getStyle();
130 mOutputListener.print("<Line box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
131 printStyle(line_style);
132 mOutputListener.println(" cur_num=\"" + line.getCurrentNum() + "\">");
133
134
135 // For each word in the line...
136 for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
137 // Output the bounding box for the word.
138 bbox = word.getBBox();
139 mOutputListener.print("<Word box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
140 mOutputListener.print(" cur_num=\"" + word.getCurrentNum() + "\"");
141 int sz = word.getStringLen();
142 if (sz == 0) continue;
143
144 // If the word style is different from the parent style, output the new style.
145 s = word.getStyle();
146 if (!s.equals(line_style)) {
147 printStyle(s);
148 }
149
150 mOutputListener.print(">" + word.getString());
151 mOutputListener.println("</Word>");
152 }
153 mOutputListener.println("</Line>");
154 }
155
156 if (cur_flow_id != -1) {
157 if (cur_para_id != -1) {
158 cur_para_id = -1;
159 mOutputListener.println("</Para>");
160 }
161 mOutputListener.println("</Flow>");
162 }
163 }
164 txt.destroy();
165 mOutputListener.println("</PDFText>");
166 } catch (PDFNetException e) {
167 mOutputListener.printError(e.getStackTrace());
168 }
169
170 // Sample code showing how to use low-level text extraction APIs.
171 if (example5_low_level) {
172 try (PDFDoc doc = new PDFDoc((Utils.getAssetTempFile(INPUT_PATH + "newsletter.pdf").getAbsolutePath()))) {
173 doc.initSecurityHandler();
174
175 // Example 1. Extract all text content from the document
176
177 ElementReader reader = new ElementReader();
178 // Read every page
179 for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
180 reader.begin(itr.next());
181 DumpAllText(reader);
182 reader.end();
183 }
184
185 // Example 2. Extract text content based on the
186 // selection rectangle.
187 mOutputListener.print("\n----------------------------------------------------");
188 mOutputListener.print("\nExtract text based on the selection rectangle.");
189 mOutputListener.println("\n----------------------------------------------------");
190
191 Page first_page = doc.getPageIterator().next();
192 String s1 = ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
193 mOutputListener.print("\nField 1: " + s1);
194
195 s1 = ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
196 mOutputListener.print("\nField 2: " + s1);
197
198 s1 = ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);
199 mOutputListener.print("\nField 3: " + s1);
200
201 // ...
202 mOutputListener.println("Done.");
203 } catch (Exception e) {
204 mOutputListener.printError(e.getStackTrace());
205 }
206 }
207
208 for (String file : mFileList) {
209 addToFileList(file);
210 }
211 printFooter(outputListener);
212 }
213
214
215 static void printStyle(TextExtractor.Style s) {
216 byte r = s.getColor()[0];
217 byte g = s.getColor()[1];
218 byte b = s.getColor()[2];
219 String rgb_hex = String.format("%02X%02X%02X;", r, g, b );
220 DecimalFormat df = new DecimalFormat("#.#");
221 mOutputListener.print(" style=\"font-family:" + s.getFontName() + "; "
222 + "font-size:" + df.format(s.getFontSize()) + ";"
223 + (s.isSerif() ? " sans-serif; " : " ")
224 + "color:#" + rgb_hex + "\"");
225 }
226
227 // A utility method used to dump all text content in the console window.
228 static void DumpAllText(ElementReader reader) throws PDFNetException {
229 Element element;
230 while ((element = reader.next()) != null) {
231 switch (element.getType()) {
232 case Element.e_text_begin:
233 mOutputListener.println("\n--> Text Block Begin");
234 break;
235 case Element.e_text_end:
236 mOutputListener.println("\n--> Text Block End");
237 break;
238 case Element.e_text: {
239 Rect bbox = element.getBBox();
240 if (bbox == null) continue;
241 mOutputListener.println("\n--> BBox: " + bbox.getX1() + ", "
242 + bbox.getY1() + ", "
243 + bbox.getX2() + ", "
244 + bbox.getY2());
245
246 String arr = element.getTextString();
247 mOutputListener.println(arr);
248 }
249 break;
250 case Element.e_text_new_line:
251 mOutputListener.println("\n--> New Line");
252 break;
253 case Element.e_form: // Process form XObjects
254 reader.formBegin();
255 DumpAllText(reader);
256 reader.end();
257 break;
258 }
259 }
260 }
261
262 // A helper method for ReadTextFromRect
263 static String RectTextSearch(ElementReader reader, Rect pos) throws PDFNetException {
264 Element element;
265 String srch_str = new String();
266 while ((element = reader.next()) != null) {
267 switch (element.getType()) {
268 case Element.e_text: {
269 Rect bbox = element.getBBox();
270 if (bbox == null) continue;
271 if (bbox.intersectRect(bbox, pos)) {
272 String arr = element.getTextString();
273 srch_str += arr;
274 srch_str += "\n"; // add a new line?
275 }
276 break;
277 }
278 case Element.e_text_new_line: {
279 break;
280 }
281 case Element.e_form: // Process form XObjects
282 {
283 reader.formBegin();
284 srch_str += RectTextSearch(reader, pos);
285 reader.end();
286 break;
287 }
288 }
289 }
290 return srch_str;
291 }
292
293 // A utility method used to extract all text content from
294 // a given selection rectangle. The rectangle coordinates are
295 // expressed in PDF user/page coordinate system.
296 static String ReadTextFromRect(Page page, Rect pos, ElementReader reader) throws PDFNetException {
297 reader.begin(page);
298 String srch_str = RectTextSearch(reader, pos);
299 reader.end();
300 return srch_str;
301 }
302
303}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package com.pdftron.android.pdfnetsdksamples.samples
7
8import com.pdftron.android.pdfnetsdksamples.OutputListener
9import com.pdftron.android.pdfnetsdksamples.PDFNetSample
10import com.pdftron.android.pdfnetsdksamples.R
11import com.pdftron.android.pdfnetsdksamples.util.Utils
12import com.pdftron.common.PDFNetException
13import com.pdftron.pdf.*
14import java.text.DecimalFormat
15import java.util.*
16
/**
 * Sample that reads text from the first page of "newsletter.pdf" using the
 * high-level [TextExtractor] API and, optionally, the low-level
 * [ElementReader] API. All output is sent to the [OutputListener] passed to [run].
 */
class TextExtractTest : PDFNetSample() {
    init {
        setTitle(R.string.sample_textextract_title)
        setDescription(R.string.sample_textextract_description)
    }

    /**
     * Runs the enabled examples and reports their output (or any error) to the listener.
     *
     * @param outputListener sink for everything the sample prints
     */
    override fun run(outputListener: OutputListener?) {
        super.run(outputListener)
        mOutputListener = outputListener
        mFileList.clear()
        printHeader(outputListener!!)

        // Switches selecting which examples are executed.
        val example1_basic = false
        val example2_xml = false
        val example3_wordlist = false
        val example4_advanced = true
        val example5_low_level = false

        // Sample code showing how to use high-level text extraction APIs.
        try {
            PDFDoc(Utils.getAssetTempFile(PDFNetSample.INPUT_PATH + "newsletter.pdf")!!.absolutePath).use { doc ->
                doc.initSecurityHandler()

                val page = doc.getPage(1)
                if (page == null) {
                    // Nothing to extract from; skip the examples instead of dereferencing null.
                    mOutputListener!!.println("Page not found.")
                } else {
                    runHighLevelExamples(page, example1_basic, example2_xml, example3_wordlist, example4_advanced)
                }
            }
        } catch (e: PDFNetException) {
            mOutputListener!!.printError(e.stackTrace)
        }

        // Sample code showing how to use low-level text extraction APIs.
        if (example5_low_level) {
            runLowLevelExamples()
        }

        for (file in mFileList) {
            addToFileList(file)
        }
        printFooter(outputListener)
    }

    companion object {

        private const val SEPARATOR = "-----------------------------------------------------------"

        // Receives all sample output; assigned at the start of every run().
        private var mOutputListener: OutputListener? = null

        // Files registered through addToFileList() at the end of run(); this sample never adds any.
        private val mFileList = ArrayList<String>()

        /**
         * Runs the selected TextExtractor examples on one page. The extractor is
         * destroyed even when one of the examples throws.
         */
        @Throws(PDFNetException::class)
        private fun runHighLevelExamples(page: Page, basic: Boolean, xml: Boolean,
                                         wordList: Boolean, advanced: Boolean) {
            val txt = TextExtractor()
            try {
                txt.begin(page) // Read the page.
                // Other options you may want to consider...
                // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_no_dup_remove);
                // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                // ...

                if (basic) {
                    printPlainText(txt)
                }
                if (xml) {
                    printXml(txt)
                }
                if (wordList) {
                    printWordList(txt)
                }
                if (advanced) {
                    printStructuredText(txt)
                }
            } finally {
                txt.destroy()
            }
        }

        /** Example 1. Prints the word count and all text on the page as a single string. */
        @Throws(PDFNetException::class)
        private fun printPlainText(txt: TextExtractor) {
            val out = mOutputListener!!
            out.println("Word Count: " + txt.wordCount)

            out.println("\n\n- GetAsText --------------------------\n" + txt.asText)
            out.println(SEPARATOR)
        }

        /** Example 2. Prints the XML logical structure for the page. */
        @Throws(PDFNetException::class)
        private fun printXml(txt: TextExtractor) {
            val out = mOutputListener!!
            val text = txt.getAsXML(TextExtractor.e_words_as_elements or TextExtractor.e_output_bbox or TextExtractor.e_output_style_info)
            out.println("\n\n- GetAsXML --------------------------\n$text")
            out.println(SEPARATOR)
        }

        /** Example 3. Prints the words of the page one per line. */
        @Throws(PDFNetException::class)
        private fun printWordList(txt: TextExtractor) {
            val out = mOutputListener!!
            var line: TextExtractor.Line = txt.firstLine
            while (line.isValid) {
                var word: TextExtractor.Word = line.firstWord
                while (word.isValid) {
                    out.println(word.string)
                    word = word.nextWord
                }
                line = line.nextLine
            }
            out.println(SEPARATOR)
        }

        /**
         * Example 4. Prints an XML-like structure of flows, paragraphs, lines and
         * words together with bounding-box and style information. The closing
         * `</PDFText>` tag is emitted here so it always pairs with the opening tag.
         */
        @Throws(PDFNetException::class)
        private fun printStructuredText(txt: TextExtractor) {
            val out = mOutputListener!!
            var curFlowId = -1
            var curParaId = -1

            out.println("<PDFText>")
            // For each line on the page...
            var line: TextExtractor.Line = txt.firstLine
            while (line.isValid) {
                if (line.numWords != 0) {
                    if (curFlowId != line.flowID) {
                        // Entering a new flow: close whatever paragraph/flow is still open.
                        if (curFlowId != -1) {
                            if (curParaId != -1) {
                                curParaId = -1
                                out.println("</Para>")
                            }
                            out.println("</Flow>")
                        }
                        curFlowId = line.flowID
                        out.println("<Flow id=\"$curFlowId\">")
                    }

                    if (curParaId != line.paragraphID) {
                        if (curParaId != -1) {
                            out.println("</Para>")
                        }
                        curParaId = line.paragraphID
                        out.println("<Para id=\"$curParaId\">")
                    }

                    val lineStyle = line.style
                    out.print("<Line box=\"" + formatBox(line.bBox) + "\"")
                    printStyle(lineStyle)
                    out.println(" cur_num=\"" + line.currentNum + "\">")

                    // For each word in the line...
                    var word: TextExtractor.Word = line.firstWord
                    while (word.isValid) {
                        // Skip empty words before writing anything, so no unclosed <Word tag is left behind.
                        if (word.stringLen != 0) {
                            out.print("<Word box=\"" + formatBox(word.bBox) + "\"")
                            out.print(" cur_num=\"" + word.currentNum + "\"")

                            // If the word style is different from the parent style, output the new style.
                            val wordStyle = word.style
                            if (wordStyle != lineStyle) {
                                printStyle(wordStyle)
                            }

                            out.print(">" + word.string)
                            out.println("</Word>")
                        }
                        word = word.nextWord
                    }
                    out.println("</Line>")
                }
                line = line.nextLine
            }

            if (curFlowId != -1) {
                if (curParaId != -1) {
                    out.println("</Para>")
                }
                out.println("</Flow>")
            }
            out.println("</PDFText>")
        }

        /** Formats a rectangle as "x1, y1, x2, y2" with two decimals per coordinate. */
        @Throws(PDFNetException::class)
        private fun formatBox(bbox: Rect): String {
            return String.format("%.2f, %.2f, %.2f, %.2f", bbox.x1, bbox.y1, bbox.x2, bbox.y2)
        }

        /**
         * Example 5. Dumps the text of every page with an ElementReader, then reads
         * the text found inside three fixed rectangles on the first page. Any
         * exception is reported to the output listener.
         */
        private fun runLowLevelExamples() {
            val out = mOutputListener!!
            try {
                PDFDoc(Utils.getAssetTempFile(PDFNetSample.INPUT_PATH + "newsletter.pdf")!!.absolutePath).use { doc ->
                    doc.initSecurityHandler()

                    val reader = ElementReader()
                    try {
                        // Example 1. Extract all text content from the document
                        // Read every page
                        val itr = doc.pageIterator
                        while (itr.hasNext()) {
                            reader.begin(itr.next())
                            DumpAllText(reader)
                            reader.end()
                        }

                        // Example 2. Extract text content based on the
                        // selection rectangle.
                        out.print("\n----------------------------------------------------")
                        out.print("\nExtract text based on the selection rectangle.")
                        out.println("\n----------------------------------------------------")

                        val first_page = doc.pageIterator.next()!!
                        var s1 = ReadTextFromRect(first_page, Rect(27.0, 392.0, 563.0, 534.0), reader)
                        out.print("\nField 1: $s1")

                        s1 = ReadTextFromRect(first_page, Rect(28.0, 551.0, 106.0, 623.0), reader)
                        out.print("\nField 2: $s1")

                        s1 = ReadTextFromRect(first_page, Rect(208.0, 550.0, 387.0, 621.0), reader)
                        out.print("\nField 3: $s1")

                        // ...
                        out.println("Done.")
                    } finally {
                        // Release the reader on success and on failure alike.
                        reader.destroy()
                    }
                }
            } catch (e: Exception) {
                out.printError(e.stackTrace)
            }
        }

        /**
         * Prints a `style="..."` attribute describing the font name, font size,
         * serif flag and color of the given style.
         */
        internal fun printStyle(s: TextExtractor.Style) {
            val rgb = s.color // fetched once; the first three entries are printed as hex
            val rgb_hex = String.format("%02X%02X%02X;", rgb[0], rgb[1], rgb[2])
            val df = DecimalFormat("#.#")
            mOutputListener!!.print(" style=\"font-family:" + s.fontName + "; "
                    + "font-size:" + df.format(s.fontSize) + ";"
                    // A serif font is tagged "serif" (it used to be mislabelled "sans-serif").
                    + (if (s.isSerif) " serif; " else " ")
                    + "color:#" + rgb_hex + "\"")
        }

        /**
         * A utility method used to dump all text content to the output listener.
         * Form XObjects are entered and processed recursively.
         */
        @Throws(PDFNetException::class)
        internal fun DumpAllText(reader: ElementReader) {
            val out = mOutputListener!!
            while (true) {
                val element: Element = reader.next() ?: break
                when (element.type) {
                    Element.e_text_begin -> out.println("\n--> Text Block Begin")
                    Element.e_text_end -> out.println("\n--> Text Block End")
                    Element.e_text -> {
                        val bbox = element.bBox
                        if (bbox != null) {
                            out.println("\n--> BBox: " + bbox.x1 + ", "
                                    + bbox.y1 + ", "
                                    + bbox.x2 + ", "
                                    + bbox.y2)
                            out.println(element.textString)
                        }
                    }
                    Element.e_text_new_line -> out.println("\n--> New Line")
                    Element.e_form -> { // Process form XObjects
                        reader.formBegin()
                        DumpAllText(reader)
                        reader.end()
                    }
                }
            }
        }

        /**
         * A helper method for ReadTextFromRect. Collects the text of every text
         * element whose bounding box intersects [pos], one element per line,
         * descending into form XObjects.
         *
         * @return the collected text, each matching element followed by "\n"
         */
        @Throws(PDFNetException::class)
        internal fun RectTextSearch(reader: ElementReader, pos: Rect): String {
            val found = StringBuilder()
            while (true) {
                val element: Element = reader.next() ?: break
                when (element.type) {
                    Element.e_text -> {
                        val bbox = element.bBox
                        // NOTE(review): bbox is both receiver and first argument; presumably the
                        // intersection is written back into bbox - confirm against the Rect API.
                        if (bbox != null && bbox.intersectRect(bbox, pos)) {
                            found.append(element.textString)
                            found.append("\n") // add a new line?
                        }
                    }
                    Element.e_form -> { // Process form XObjects
                        reader.formBegin()
                        found.append(RectTextSearch(reader, pos))
                        reader.end()
                    }
                    // e_text_new_line and all other element types contribute nothing.
                }
            }
            return found.toString()
        }

        /**
         * A utility method used to extract all text content from a given selection
         * rectangle. The rectangle coordinates are expressed in PDF user/page
         * coordinate system. The reader is begun and ended by this call.
         */
        @Throws(PDFNetException::class)
        internal fun ReadTextFromRect(page: Page, pos: Rect, reader: ElementReader): String {
            reader.begin(page)
            val srch_str = RectTextSearch(reader, pos)
            reader.end()
            return srch_str
        }
    }

}
Did you find this helpful?
Trial setup questions? Ask experts on Discord.
Need other help? Contact Support.
Pricing or product questions? Contact Sales.