TextExtract

Sample Java code for using the Apryse SDK to read a PDF (parse and extract text). If you'd like to search for text on PDF pages, see our code sample for text search. Learn more about our Android SDK and PDF Data Extraction SDK capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package com.pdftron.android.pdfnetsdksamples.samples;
7
8import com.pdftron.android.pdfnetsdksamples.OutputListener;
9import com.pdftron.android.pdfnetsdksamples.PDFNetSample;
10import com.pdftron.android.pdfnetsdksamples.R;
11import com.pdftron.android.pdfnetsdksamples.util.Utils;
12import com.pdftron.common.PDFNetException;
13import com.pdftron.pdf.Element;
14import com.pdftron.pdf.ElementReader;
15import com.pdftron.pdf.PDFDoc;
16import com.pdftron.pdf.Page;
17import com.pdftron.pdf.PageIterator;
18import com.pdftron.pdf.Rect;
19import com.pdftron.pdf.TextExtractor;
20
21import java.text.DecimalFormat;
22import java.util.ArrayList;
23
24public class TextExtractTest extends PDFNetSample {
25
26 private static OutputListener mOutputListener;
27
28 private static ArrayList<String> mFileList = new ArrayList<>();
29
30 public TextExtractTest() {
31 setTitle(R.string.sample_textextract_title);
32 setDescription(R.string.sample_textextract_description);
33 }
34
35 @Override
36 public void run(OutputListener outputListener) {
37 super.run(outputListener);
38 mOutputListener = outputListener;
39 mFileList.clear();
40 printHeader(outputListener);
41
42 // string output_path = "../../TestFiles/Output/";
43 boolean example1_basic = false;
44 boolean example2_xml = false;
45 boolean example3_wordlist = false;
46 boolean example4_advanced = true;
47 boolean example5_low_level = false;
48
49 // Sample code showing how to use high-level text extraction APIs.
50 try (PDFDoc doc = new PDFDoc(Utils.getAssetTempFile(INPUT_PATH + "newsletter.pdf").getAbsolutePath())) {
51 doc.initSecurityHandler();
52
53 Page page = doc.getPage(1);
54 if (page == null) {
55 mOutputListener.println("Page not found.");
56 }
57
58 TextExtractor txt = new TextExtractor();
59 txt.begin(page); // Read the page.
60 // Other options you may want to consider...
61 // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_no_dup_remove);
62 // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_remove_hidden_text);
63 // ...
64
65 // Example 1. Get all text on the page in a single string.
66 // Words will be separated with space or new line characters.
67 if (example1_basic) {
68 // Get the word count.
69 mOutputListener.println("Word Count: " + txt.getWordCount());
70
71 mOutputListener.println("\n\n- GetAsText --------------------------\n" + txt.getAsText());
72 mOutputListener.println("-----------------------------------------------------------");
73 }
74
75 // Example 2. Get XML logical structure for the page.
76 if (example2_xml) {
77 String text = txt.getAsXML(TextExtractor.e_words_as_elements | TextExtractor.e_output_bbox | TextExtractor.e_output_style_info);
78 mOutputListener.println("\n\n- GetAsXML --------------------------\n" + text);
79 mOutputListener.println("-----------------------------------------------------------");
80 }
81
82 // Example 3. Extract words one by one.
83 if (example3_wordlist) {
84 TextExtractor.Word word;
85 for (TextExtractor.Line line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
86 for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
87 mOutputListener.println(word.getString());
88 }
89 }
90 mOutputListener.println("-----------------------------------------------------------");
91 }
92
93 // Example 4. A more advanced text extraction example.
94 // The output is XML structure containing paragraphs, lines, words,
95 // as well as style and positioning information.
96 if (example4_advanced) {
97 Rect bbox;
98 int cur_flow_id = -1, cur_para_id = -1;
99
100 TextExtractor.Line line;
101 TextExtractor.Word word;
102 TextExtractor.Style s, line_style;
103
104 mOutputListener.println("<PDFText>");
105 // For each line on the page...
106 for (line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
107 if (line.getNumWords() == 0)
108 continue;
109 if (cur_flow_id != line.getFlowID()) {
110 if (cur_flow_id != -1) {
111 if (cur_para_id != -1) {
112 cur_para_id = -1;
113 mOutputListener.println("</Para>");
114 }
115 mOutputListener.println("</Flow>");
116 }
117 cur_flow_id = line.getFlowID();
118 mOutputListener.println("<Flow id=\"" + cur_flow_id + "\">");
119 }
120
121 if (cur_para_id != line.getParagraphID()) {
122 if (cur_para_id != -1)
123 mOutputListener.println("</Para>");
124 cur_para_id = line.getParagraphID();
125 mOutputListener.println("<Para id=\"" + cur_para_id + "\">");
126 }
127
128 bbox = line.getBBox();
129 line_style = line.getStyle();
130 mOutputListener.print("<Line box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
131 printStyle(line_style);
132 mOutputListener.println(" cur_num=\"" + line.getCurrentNum() + "\">");
133
134
135 // For each word in the line...
136 for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
137 // Output the bounding box for the word.
138 bbox = word.getBBox();
139 mOutputListener.print("<Word box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
140 mOutputListener.print(" cur_num=\"" + word.getCurrentNum() + "\"");
141 int sz = word.getStringLen();
142 if (sz == 0) continue;
143
144 // If the word style is different from the parent style, output the new style.
145 s = word.getStyle();
146 if (!s.equals(line_style)) {
147 printStyle(s);
148 }
149
150 mOutputListener.print(">" + word.getString());
151 mOutputListener.println("</Word>");
152 }
153 mOutputListener.println("</Line>");
154 }
155
156 if (cur_flow_id != -1) {
157 if (cur_para_id != -1) {
158 cur_para_id = -1;
159 mOutputListener.println("</Para>");
160 }
161 mOutputListener.println("</Flow>");
162 }
163 }
164 txt.destroy();
165 mOutputListener.println("</PDFText>");
166 } catch (PDFNetException e) {
167 mOutputListener.printError(e.getStackTrace());
168 }
169
170 // Sample code showing how to use low-level text extraction APIs.
171 if (example5_low_level) {
172 try (PDFDoc doc = new PDFDoc((Utils.getAssetTempFile(INPUT_PATH + "newsletter.pdf").getAbsolutePath()))) {
173 doc.initSecurityHandler();
174
175 // Example 1. Extract all text content from the document
176
177 ElementReader reader = new ElementReader();
178 // Read every page
179 for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
180 reader.begin(itr.next());
181 DumpAllText(reader);
182 reader.end();
183 }
184
185 // Example 2. Extract text content based on the
186 // selection rectangle.
187 mOutputListener.print("\n----------------------------------------------------");
188 mOutputListener.print("\nExtract text based on the selection rectangle.");
189 mOutputListener.println("\n----------------------------------------------------");
190
191 Page first_page = doc.getPageIterator().next();
192 String s1 = ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
193 mOutputListener.print("\nField 1: " + s1);
194
195 s1 = ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
196 mOutputListener.print("\nField 2: " + s1);
197
198 s1 = ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);
199 mOutputListener.print("\nField 3: " + s1);
200
201 // ...
202 mOutputListener.println("Done.");
203 } catch (Exception e) {
204 mOutputListener.printError(e.getStackTrace());
205 }
206 }
207
208 for (String file : mFileList) {
209 addToFileList(file);
210 }
211 printFooter(outputListener);
212 }
213
214
215 static void printStyle(TextExtractor.Style s) {
216 byte r = s.getColor()[0];
217 byte g = s.getColor()[1];
218 byte b = s.getColor()[2];
219 String rgb_hex = String.format("%02X%02X%02X;", r, g, b );
220 DecimalFormat df = new DecimalFormat("#.#");
221 mOutputListener.print(" style=\"font-family:" + s.getFontName() + "; "
222 + "font-size:" + df.format(s.getFontSize()) + ";"
223 + (s.isSerif() ? " sans-serif; " : " ")
224 + "color:#" + rgb_hex + "\"");
225 }
226
227 // A utility method used to dump all text content in the console window.
228 static void DumpAllText(ElementReader reader) throws PDFNetException {
229 Element element;
230 while ((element = reader.next()) != null) {
231 switch (element.getType()) {
232 case Element.e_text_begin:
233 mOutputListener.println("\n--> Text Block Begin");
234 break;
235 case Element.e_text_end:
236 mOutputListener.println("\n--> Text Block End");
237 break;
238 case Element.e_text: {
239 Rect bbox = element.getBBox();
240 if (bbox == null) continue;
241 mOutputListener.println("\n--> BBox: " + bbox.getX1() + ", "
242 + bbox.getY1() + ", "
243 + bbox.getX2() + ", "
244 + bbox.getY2());
245
246 String arr = element.getTextString();
247 mOutputListener.println(arr);
248 }
249 break;
250 case Element.e_text_new_line:
251 mOutputListener.println("\n--> New Line");
252 break;
253 case Element.e_form: // Process form XObjects
254 reader.formBegin();
255 DumpAllText(reader);
256 reader.end();
257 break;
258 }
259 }
260 }
261
262 // A helper method for ReadTextFromRect
263 static String RectTextSearch(ElementReader reader, Rect pos) throws PDFNetException {
264 Element element;
265 String srch_str = new String();
266 while ((element = reader.next()) != null) {
267 switch (element.getType()) {
268 case Element.e_text: {
269 Rect bbox = element.getBBox();
270 if (bbox == null) continue;
271 if (bbox.intersectRect(bbox, pos)) {
272 String arr = element.getTextString();
273 srch_str += arr;
274 srch_str += "\n"; // add a new line?
275 }
276 break;
277 }
278 case Element.e_text_new_line: {
279 break;
280 }
281 case Element.e_form: // Process form XObjects
282 {
283 reader.formBegin();
284 srch_str += RectTextSearch(reader, pos);
285 reader.end();
286 break;
287 }
288 }
289 }
290 return srch_str;
291 }
292
293 // A utility method used to extract all text content from
294 // a given selection rectangle. The rectangle coordinates are
295 // expressed in PDF user/page coordinate system.
296 static String ReadTextFromRect(Page page, Rect pos, ElementReader reader) throws PDFNetException {
297 reader.begin(page);
298 String srch_str = RectTextSearch(reader, pos);
299 reader.end();
300 return srch_str;
301 }
302
303}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales