Extract Text, Read, Parse PDF - TextExtract - Java Sample Code

Sample code for using Apryse SDK to read a PDF (parse and extract text), provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. If you'd like to search text on PDF pages, see our code sample for text search. Learn more about our Server SDK and PDF Data Extraction SDK Capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import java.awt.Color;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Locale;
10
11
12// This sample illustrates the basic text extraction capabilities of PDFNet.
13public class TextExtractTest {
14
15 public static void main(String[] args) {
16 PDFNet.initialize(PDFTronLicense.Key());
17
18 // Relative path to the folder containing test files.
19 String input_path = "../../TestFiles/";
20 // string output_path = "../../TestFiles/Output/";
21 boolean example1_basic = false;
22 boolean example2_xml = false;
23 boolean example3_wordlist = false;
24 boolean example4_advanced = true;
25 boolean example5_low_level = false;
26
27 // Sample code showing how to use high-level text extraction APIs.
28 try (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf")) {
29 doc.initSecurityHandler();
30
31 Page page = doc.getPage(1);
32 if (page == null) {
33 System.out.println("Page not found.");
34 }
35
36 TextExtractor txt = new TextExtractor();
37 txt.begin(page); // Read the page.
38 // Other options you may want to consider...
39 // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_no_dup_remove);
40 // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_remove_hidden_text);
41 // ...
42
43 // Example 1. Get all text on the page in a single string.
44 // Words will be separated with space or new line characters.
45 if (example1_basic) {
46 // Get the word count.
47 System.out.println("Word Count: " + txt.getWordCount());
48
49 System.out.println("\n\n- GetAsText --------------------------\n" + txt.getAsText());
50 System.out.println("-----------------------------------------------------------");
51 }
52
53 // Example 2. Get XML logical structure for the page.
54 if (example2_xml) {
55 String text = txt.getAsXML(TextExtractor.e_words_as_elements | TextExtractor.e_output_bbox | TextExtractor.e_output_style_info);
56 System.out.println("\n\n- GetAsXML --------------------------\n" + text);
57 System.out.println("-----------------------------------------------------------");
58 }
59
60 // Example 3. Extract words one by one.
61 if (example3_wordlist) {
62 TextExtractor.Word word;
63 for (TextExtractor.Line line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
64 for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
65 System.out.println(word.getString());
66 }
67 }
68 System.out.println("-----------------------------------------------------------");
69 }
70
71 // Example 4. A more advanced text extraction example.
72 // The output is XML structure containing paragraphs, lines, words,
73 // as well as style and positioning information.
74 if (example4_advanced) {
75 Rect bbox;
76 int cur_flow_id = -1, cur_para_id = -1;
77
78 TextExtractor.Line line;
79 TextExtractor.Word word;
80 TextExtractor.Style s, line_style;
81
82 System.out.println("<PDFText>");
83 // For each line on the page...
84 for (line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
85 if (line.getNumWords() == 0)
86 continue;
87 if (cur_flow_id != line.getFlowID()) {
88 if (cur_flow_id != -1) {
89 if (cur_para_id != -1) {
90 cur_para_id = -1;
91 System.out.println("</Para>");
92 }
93 System.out.println("</Flow>");
94 }
95 cur_flow_id = line.getFlowID();
96 System.out.println("<Flow id=\"" + cur_flow_id + "\">");
97 }
98
99 if (cur_para_id != line.getParagraphID()) {
100 if (cur_para_id != -1)
101 System.out.println("</Para>");
102 cur_para_id = line.getParagraphID();
103 System.out.println("<Para id=\"" + cur_para_id + "\">");
104 }
105
106 bbox = line.getBBox();
107 line_style = line.getStyle();
108 System.out.print("<Line box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
109 printStyle(line_style);
110 System.out.println(" cur_num=\"" + line.getCurrentNum() + "\">");
111
112
113 // For each word in the line...
114 for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
115 // Output the bounding box for the word.
116 bbox = word.getBBox();
117 System.out.print("<Word box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
118 System.out.print(" cur_num=\"" + word.getCurrentNum() + "\"");
119 int sz = word.getStringLen();
120 if (sz == 0) continue;
121
122 // If the word style is different from the parent style, output the new style.
123 s = word.getStyle();
124 if (!s.equals(line_style)) {
125 printStyle(s);
126 }
127
128 System.out.print(">" + word.getString());
129 System.out.println("</Word>");
130 }
131 System.out.println("</Line>");
132 }
133
134 if (cur_flow_id != -1) {
135 if (cur_para_id != -1) {
136 cur_para_id = -1;
137 System.out.println("</Para>");
138 }
139 System.out.println("</Flow>");
140 }
141 }
142 txt.destroy();
143 System.out.println("</PDFText>");
144 } catch (PDFNetException e) {
145 System.out.println(e);
146 }
147
148 // Sample code showing how to use low-level text extraction APIs.
149 if (example5_low_level) {
150 try (PDFDoc doc = new PDFDoc((input_path + "newsletter.pdf"))) {
151 doc.initSecurityHandler();
152
153 // Example 1. Extract all text content from the document
154
155 ElementReader reader = new ElementReader();
156 // Read every page
157 for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
158 reader.begin(itr.next());
159 DumpAllText(reader);
160 reader.end();
161 }
162
163 // Example 2. Extract text content based on the
164 // selection rectangle.
165 System.out.print("\n----------------------------------------------------");
166 System.out.print("\nExtract text based on the selection rectangle.");
167 System.out.println("\n----------------------------------------------------");
168
169 Page first_page = doc.getPageIterator().next();
170 String s1 = ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
171 System.out.print("\nField 1: " + s1);
172
173 s1 = ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
174 System.out.print("\nField 2: " + s1);
175
176 s1 = ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);
177 System.out.print("\nField 3: " + s1);
178
179 // ...
180 System.out.println("Done.");
181 } catch (Exception e) {
182 e.printStackTrace();
183 }
184 }
185
186 PDFNet.terminate();
187 }
188
189
190 static void printStyle(TextExtractor.Style s) {
191 Color rgb = s.getColor();
192 String rgb_hex = String.format("%02X%02X%02X;", rgb.getRed(), rgb.getGreen(), rgb.getBlue() );
193 DecimalFormat df = new DecimalFormat("#.#");
194 System.out.print(" style=\"font-family:" + s.getFontName() + "; "
195 + "font-size:" + df.format(s.getFontSize()) + ";"
196 + (s.isSerif() ? " sans-serif; " : " ")
197 + "color:#" + rgb_hex + "\"");
198 }
199
200 // A utility method used to dump all text content in the console window.
201 static void DumpAllText(ElementReader reader) throws PDFNetException {
202 Element element;
203 while ((element = reader.next()) != null) {
204 switch (element.getType()) {
205 case Element.e_text_begin:
206 System.out.println("\n--> Text Block Begin");
207 break;
208 case Element.e_text_end:
209 System.out.println("\n--> Text Block End");
210 break;
211 case Element.e_text: {
212 Rect bbox = element.getBBox();
213 if (bbox == null) continue;
214 System.out.println("\n--> BBox: " + bbox.getX1() + ", "
215 + bbox.getY1() + ", "
216 + bbox.getX2() + ", "
217 + bbox.getY2());
218
219 String arr = element.getTextString();
220 System.out.println(arr);
221 }
222 break;
223 case Element.e_text_new_line:
224 System.out.println("\n--> New Line");
225 break;
226 case Element.e_form: // Process form XObjects
227 reader.formBegin();
228 DumpAllText(reader);
229 reader.end();
230 break;
231 }
232 }
233 }
234
235 // A helper method for ReadTextFromRect
236 static String RectTextSearch(ElementReader reader, Rect pos) throws PDFNetException {
237 Element element;
238 String srch_str = new String();
239 while ((element = reader.next()) != null) {
240 switch (element.getType()) {
241 case Element.e_text: {
242 Rect bbox = element.getBBox();
243 if (bbox == null) continue;
244 if (bbox.intersectRect(bbox, pos)) {
245 String arr = element.getTextString();
246 srch_str += arr;
247 srch_str += "\n"; // add a new line?
248 }
249 break;
250 }
251 case Element.e_text_new_line: {
252 break;
253 }
254 case Element.e_form: // Process form XObjects
255 {
256 reader.formBegin();
257 srch_str += RectTextSearch(reader, pos);
258 reader.end();
259 break;
260 }
261 }
262 }
263 return srch_str;
264 }
265
266 // A utility method used to extract all text content from
267 // a given selection rectangle. The rectangle coordinates are
268 // expressed in PDF user/page coordinate system.
269 static String ReadTextFromRect(Page page, Rect pos, ElementReader reader) throws PDFNetException {
270 reader.begin(page);
271 String srch_str = RectTextSearch(reader, pos);
272 reader.end();
273 return srch_str;
274 }
275}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales