Extract Text, Read, Parse PDF - TextExtract - Java Sample Code

Sample code for using Apryse SDK to read a PDF (parse and extract text), provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. If you'd like to search text on PDF pages, see our code sample for text search.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------

import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import java.awt.Color;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Locale;

12// This sample illustrates the basic text extraction capabilities of PDFNet.
13public class TextExtractTest {
14
15 public static void main(String[] args) {
16 PDFNet.initialize(PDFTronLicense.Key());
17
18 // Relative path to the folder containing test files.
19 String input_path = "../../TestFiles/";
20 // string output_path = "../../TestFiles/Output/";
21 boolean example1_basic = false;
22 boolean example2_xml = false;
23 boolean example3_wordlist = false;
24 boolean example4_advanced = true;
25 boolean example5_low_level = false;
26
27 // Sample code showing how to use high-level text extraction APIs.
28 try (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf")) {
29 doc.initSecurityHandler();
30
31 Page page = doc.getPage(1);
32 if (page == null) {
33 System.out.println("Page not found.");
34 }
35
36 TextExtractor txt = new TextExtractor();
37 txt.begin(page); // Read the page.
38 // Other options you may want to consider...
39 // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_no_dup_remove);
40 // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_remove_hidden_text);
41 // ...
42
43 // Example 1. Get all text on the page in a single string.
44 // Words will be separated with space or new line characters.
45 if (example1_basic) {
46 // Get the word count.
47 System.out.println("Word Count: " + txt.getWordCount());
48
49 System.out.println("\n\n- GetAsText --------------------------\n" + txt.getAsText());
50 System.out.println("-----------------------------------------------------------");
51 }
52
53 // Example 2. Get XML logical structure for the page.
54 if (example2_xml) {
55 String text = txt.getAsXML(TextExtractor.e_words_as_elements | TextExtractor.e_output_bbox | TextExtractor.e_output_style_info);
56 System.out.println("\n\n- GetAsXML --------------------------\n" + text);
57 System.out.println("-----------------------------------------------------------");
58 }
59
60 // Example 3. Extract words one by one.
61 if (example3_wordlist) {
62 TextExtractor.Word word;
63 for (TextExtractor.Line line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
64 for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
65 System.out.println(word.getString());
66 }
67 }
68 System.out.println("-----------------------------------------------------------");
69 }
70
71 // Example 4. A more advanced text extraction example.
72 // The output is XML structure containing paragraphs, lines, words,
73 // as well as style and positioning information.
74 if (example4_advanced) {
75 Rect bbox;
76 int cur_flow_id = -1, cur_para_id = -1;
77
78 TextExtractor.Line line;
79 TextExtractor.Word word;
80 TextExtractor.Style s, line_style;
81
82 System.out.println("<PDFText>");
83 // For each line on the page...
84 for (line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
85 if (line.getNumWords() == 0)
86 continue;
87 if (cur_flow_id != line.getFlowID()) {
88 if (cur_flow_id != -1) {
89 if (cur_para_id != -1) {
90 cur_para_id = -1;
91 System.out.println("</Para>");
92 }
93 System.out.println("</Flow>");
94 }
95 cur_flow_id = line.getFlowID();
96 System.out.println("<Flow id=\"" + cur_flow_id + "\">");
97 }
98
99 if (cur_para_id != line.getParagraphID()) {
100 if (cur_para_id != -1)
101 System.out.println("</Para>");
102 cur_para_id = line.getParagraphID();
103 System.out.println("<Para id=\"" + cur_para_id + "\">");
104 }
105
106 bbox = line.getBBox();
107 line_style = line.getStyle();
108 System.out.print("<Line box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
109 printStyle(line_style);
110 System.out.println(" cur_num=\"" + line.getCurrentNum() + "\">");
111
112
113 // For each word in the line...
114 for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
115 // Output the bounding box for the word.
116 bbox = word.getBBox();
117 System.out.print("<Word box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
118 System.out.print(" cur_num=\"" + word.getCurrentNum() + "\"");
119 int sz = word.getStringLen();
120 if (sz == 0) continue;
121
122 // If the word style is different from the parent style, output the new style.
123 s = word.getStyle();
124 if (!s.equals(line_style)) {
125 printStyle(s);
126 }
127
128 System.out.print(">" + word.getString());
129 System.out.println("</Word>");
130 }
131 System.out.println("</Line>");
132 }
133
134 if (cur_flow_id != -1) {
135 if (cur_para_id != -1) {
136 cur_para_id = -1;
137 System.out.println("</Para>");
138 }
139 System.out.println("</Flow>");
140 }
141 }
142 txt.destroy();
143 System.out.println("</PDFText>");
144 } catch (PDFNetException e) {
145 System.out.println(e);
146 }
147
148 // Sample code showing how to use low-level text extraction APIs.
149 if (example5_low_level) {
150 try (PDFDoc doc = new PDFDoc((input_path + "newsletter.pdf"))) {
151 doc.initSecurityHandler();
152
153 // Example 1. Extract all text content from the document
154
155 ElementReader reader = new ElementReader();
156 // Read every page
157 for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
158 reader.begin(itr.next());
159 DumpAllText(reader);
160 reader.end();
161 }
162
163 // Example 2. Extract text content based on the
164 // selection rectangle.
165 System.out.print("\n----------------------------------------------------");
166 System.out.print("\nExtract text based on the selection rectangle.");
167 System.out.println("\n----------------------------------------------------");
168
169 Page first_page = doc.getPageIterator().next();
170 String s1 = ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
171 System.out.print("\nField 1: " + s1);
172
173 s1 = ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
174 System.out.print("\nField 2: " + s1);
175
176 s1 = ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);
177 System.out.print("\nField 3: " + s1);
178
179 // ...
180 System.out.println("Done.");
181 } catch (Exception e) {
182 e.printStackTrace();
183 }
184 }
185
186 PDFNet.terminate();
187 }
188
189
190 static void printStyle(TextExtractor.Style s) {
191 Color rgb = s.getColor();
192 String rgb_hex = String.format("%02X%02X%02X;", rgb.getRed(), rgb.getGreen(), rgb.getBlue() );
193 DecimalFormat df = new DecimalFormat("#.#");
194 System.out.print(" style=\"font-family:" + s.getFontName() + "; "
195 + "font-size:" + df.format(s.getFontSize()) + ";"
196 + (s.isSerif() ? " sans-serif; " : " ")
197 + "color:#" + rgb_hex + "\"");
198 }
199
200 // A utility method used to dump all text content in the console window.
201 static void DumpAllText(ElementReader reader) throws PDFNetException {
202 Element element;
203 while ((element = reader.next()) != null) {
204 switch (element.getType()) {
205 case Element.e_text_begin:
206 System.out.println("\n--> Text Block Begin");
207 break;
208 case Element.e_text_end:
209 System.out.println("\n--> Text Block End");
210 break;
211 case Element.e_text: {
212 Rect bbox = element.getBBox();
213 if (bbox == null) continue;
214 System.out.println("\n--> BBox: " + bbox.getX1() + ", "
215 + bbox.getY1() + ", "
216 + bbox.getX2() + ", "
217 + bbox.getY2());
218
219 String arr = element.getTextString();
220 System.out.println(arr);
221 }
222 break;
223 case Element.e_text_new_line:
224 System.out.println("\n--> New Line");
225 break;
226 case Element.e_form: // Process form XObjects
227 reader.formBegin();
228 DumpAllText(reader);
229 reader.end();
230 break;
231 }
232 }
233 }
234
235 // A helper method for ReadTextFromRect
236 static String RectTextSearch(ElementReader reader, Rect pos) throws PDFNetException {
237 Element element;
238 String srch_str = new String();
239 while ((element = reader.next()) != null) {
240 switch (element.getType()) {
241 case Element.e_text: {
242 Rect bbox = element.getBBox();
243 if (bbox == null) continue;
244 if (bbox.intersectRect(bbox, pos)) {
245 String arr = element.getTextString();
246 srch_str += arr;
247 srch_str += "\n"; // add a new line?
248 }
249 break;
250 }
251 case Element.e_text_new_line: {
252 break;
253 }
254 case Element.e_form: // Process form XObjects
255 {
256 reader.formBegin();
257 srch_str += RectTextSearch(reader, pos);
258 reader.end();
259 break;
260 }
261 }
262 }
263 return srch_str;
264 }
265
266 // A utility method used to extract all text content from
267 // a given selection rectangle. The rectangle coordinates are
268 // expressed in PDF user/page coordinate system.
269 static String ReadTextFromRect(Page page, Rect pos, ElementReader reader) throws PDFNetException {
270 reader.begin(page);
271 String srch_str = RectTextSearch(reader, pos);
272 reader.end();
273 return srch_str;
274 }
275}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales