PDF Data Extraction - Images, Text, Paths - Java Sample Code

Sample code for using Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Sample code provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import com.pdftron.sdf.*;
7import com.pdftron.pdf.*;
8import com.pdftron.common.*;
9import com.pdftron.filters.FilterReader;
10
11
12public class ElementReaderAdvTest {
13
14 static String m_buf;
15
16 static void ProcessPath(ElementReader reader, Element path) throws PDFNetException {
17 if (path.isClippingPath()) {
18 System.out.println("This is a clipping path");
19 }
20
21 PathData pathData = path.getPathData();
22 double[] data = pathData.getPoints();
23 byte[] opr = pathData.getOperators();
24
25 double x1, y1, x2, y2, x3, y3;
26 // Use path.getCTM() if you are interested in CTM (current transformation matrix).
27
28 System.out.print(" Path Data Points := \"");
29 int data_index = 0;
30 for (int opr_index = 0; opr_index < opr.length; ++opr_index) {
31 switch (opr[opr_index]) {
32 case PathData.e_moveto:
33 x1 = data[data_index];
34 ++data_index;
35 y1 = data[data_index];
36 ++data_index;
37 System.out.print("M" + x1 + " " + y1);
38 break;
39 case PathData.e_lineto:
40 x1 = data[data_index];
41 ++data_index;
42 y1 = data[data_index];
43 ++data_index;
44 System.out.print(" L" + x1 + " " + y1);
45
46 break;
47 case PathData.e_cubicto:
48 x1 = data[data_index];
49 ++data_index;
50 y1 = data[data_index];
51 ++data_index;
52 x2 = data[data_index];
53 ++data_index;
54 y2 = data[data_index];
55 ++data_index;
56 x3 = data[data_index];
57 ++data_index;
58 y3 = data[data_index];
59 ++data_index;
60 System.out.print(" C" + x1 + " " + y1 + " " + x2 + " " + y2 + " " + x3 + " " + y3);
61 break;
62 case PathData.e_rect: {
63 x1 = data[data_index];
64 ++data_index;
65 y1 = data[data_index];
66 ++data_index;
67 double w = data[data_index];
68 ++data_index;
69 double h = data[data_index];
70 ++data_index;
71 x2 = x1 + w;
72 y2 = y1;
73 x3 = x2;
74 y3 = y1 + h;
75 double x4 = x1;
76 double y4 = y3;
77 System.out.print("M" + x1 + " " + y1 + " L" + x2 + " " + y2 + " L" + x3 + " " + y3 + " L" + x4 + " " + y4 + " Z");
78 }
79 break;
80 case PathData.e_closepath:
81 System.out.println(" Close Path");
82 break;
83 default:
84 throw new PDFNetException("Invalid Element Type", 0, "", "", "");
85 }
86 }
87
88 System.out.print("\" ");
89
90 GState gs = path.getGState();
91
92 // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
93 if (path.isStroked()) {
94 System.out.println("Stroke path");
95
96 if (gs.getStrokeColorSpace().getType() == ColorSpace.e_pattern) {
97 System.out.println("Path has associated pattern");
98 } else {
99 // Get stroke color (you can use PDFNet color conversion facilities)
100 ColorPt rgb = new ColorPt();
101 rgb = gs.getStrokeColor();
102 double v = rgb.get(0);
103 rgb = gs.getStrokeColorSpace().convert2RGB(rgb);
104 v = rgb.get(0);
105 }
106 } else {
107 // Do not stroke path
108 }
109
110 if (path.isFilled()) {
111 System.out.println("Fill path");
112
113 if (gs.getFillColorSpace().getType() == ColorSpace.e_pattern) {
114 System.out.println("Path has associated pattern");
115 PatternColor pat = gs.getFillPattern();
116 int type = pat.getType();
117 if (type == PatternColor.e_shading) {
118 System.out.println("Shading");
119 Shading shading = pat.getShading();
120 if (shading.getType() == Shading.e_function_shading) {
121 System.out.println("FUNCT");
122 } else if (shading.getType() == Shading.e_axial_shading) {
123 System.out.println("AXIAL");
124 } else if (shading.getType() == Shading.e_radial_shading) {
125 System.out.println("RADIAL");
126 }
127 } else if (type == PatternColor.e_colored_tiling_pattern) {
128 System.out.println("e_colored_tiling_pattern");
129 } else if (type == PatternColor.e_uncolored_tiling_pattern) {
130 System.out.println("e_uncolored_tiling_pattern");
131 } else {
132 System.out.println("?");
133 }
134 } else {
135 ColorPt rgb = new ColorPt();
136 rgb = gs.getFillColor();
137 double v = rgb.get(0);
138 rgb = gs.getFillColorSpace().convert2RGB(rgb);
139 v = rgb.get(0);
140 }
141 } else {
142 // Do not fill path
143 }
144
145 // Process any changes in graphics state ---------------------------------
146
147 GSChangesIterator gs_itr = reader.getChangesIterator();
148 while (gs_itr.hasNext()) {
149 switch (gs_itr.next().intValue()) {
150 case GState.e_transform:
151 // Get transform matrix for this element. Unlike path.GetCTM()
152 // that return full transformation matrix gs.GetTransform() return
153 // only the transformation matrix that was installed for this element.
154 //
155 //gs.getTransform();
156 break;
157 case GState.e_line_width:
158 //gs.getLineWidth();
159 break;
160 case GState.e_line_cap:
161 //gs.getLineCap();
162 break;
163 case GState.e_line_join:
164 //gs.getLineJoin();
165 break;
166 case GState.e_flatness:
167 break;
168 case GState.e_miter_limit:
169 //gs.getMiterLimit();
170 break;
171 case GState.e_dash_pattern: {
172 //double[] dashes;
173 //dashes=gs.getDashes();
174 //gs.getPhase();
175 }
176 break;
177 case GState.e_fill_color: {
178 if (gs.getFillColorSpace().getType() == ColorSpace.e_pattern &&
179 gs.getFillPattern().getType() != PatternColor.e_shading) {
180 //process the pattern data
181 reader.patternBegin(true);
182 ProcessElements(reader);
183 reader.end();
184 }
185 }
186 break;
187 }
188 }
189 reader.clearChangeList();
190 }
191
192 static void ProcessText(ElementReader page_reader) throws PDFNetException {
193 // Begin text element
194 System.out.println("Begin Text Block:");
195
196 Element element;
197 while ((element = page_reader.next()) != null) {
198 switch (element.getType()) {
199 case Element.e_text_end:
200 // Finish the text block
201 System.out.println("End Text Block.");
202 return;
203
204 case Element.e_text: {
205 GState gs = element.getGState();
206
207 ColorSpace cs_fill = gs.getFillColorSpace();
208 ColorPt fill = gs.getFillColor();
209
210 ColorPt out;
211 out = cs_fill.convert2RGB(fill);
212
213
214 ColorSpace cs_stroke = gs.getStrokeColorSpace();
215 ColorPt stroke = gs.getStrokeColor();
216
217 Font font = gs.getFont();
218
219 System.out.println("Font Name: " + font.getName());
220 //font.isFixedWidth();
221 //font.isSerif();
222 //font.isSymbolic();
223 //font.isItalic();
224 // ...
225
226 //double font_size = gs.getFontSize();
227 //double word_spacing = gs.getWordSpacing();
228 //double char_spacing = gs.getCharSpacing();
229 //String txt = element.getTextString();
230
231 if (font.getType() == Font.e_Type3) {
232 //type 3 font, process its data
233 for (CharIterator itr = element.getCharIterator(); itr.hasNext(); ) {
234 page_reader.type3FontBegin(itr.next(), null);
235 ProcessElements(page_reader);
236 page_reader.end();
237 }
238 } else {
239 Matrix2D text_mtx = element.getTextMatrix();
240 double x, y;
241 long char_code;
242
243 for (CharIterator itr = element.getCharIterator(); itr.hasNext(); ) {
244 CharData data = itr.next();
245 char_code = data.getCharCode();
246 //System.out.print("Character code: ");
247
248 System.out.print(String.valueOf(char_code));
249
250 x = data.getGlyphX(); // character positioning information
251 y = data.getGlyphY();
252
253 // Use element.getCTM() if you are interested in the CTM
254 // (current transformation matrix).
255 Matrix2D ctm = element.getCTM();
256
257 // To get the exact character positioning information you need to
258 // concatenate current text matrix with CTM and then multiply
259 // relative positioning coordinates with the resulting matrix.
260 //
261 Matrix2D mtx = ctm.multiply(text_mtx);
262 java.awt.geom.Point2D.Double t = mtx.multPoint(x, y);
263 x = t.x;
264 y = t.y;
265 //System.out.println(" Position: x=" + x + " y=" + y );
266 }
267
268 System.out.println();
269 }
270 }
271 break;
272 }
273 }
274 }
275
276 static void ProcessImage(Element image) throws PDFNetException {
277 boolean image_mask = image.isImageMask();
278 boolean interpolate = image.isImageInterpolate();
279 int width = image.getImageWidth();
280 int height = image.getImageHeight();
281 int out_data_sz = width * height * 3;
282
283 System.out.println("Image: " +
284 " width=\"" + width + "\""
285 + " height=\"" + height);
286
287 // Matrix2D& mtx = image->GetCTM(); // image matrix (page positioning info)
288
289 // You can use GetImageData to read the raw (decoded) image data
290 //image->GetBitsPerComponent();
291 //image->GetImageData(); // get raw image data
292 // .... or use Image2RGB filter that converts every image to RGB format,
293 // This should save you time since you don't need to deal with color conversions,
294 // image up-sampling, decoding etc.
295
296 Image2RGB img_conv = new Image2RGB(image); // Extract and convert image to RGB 8-bpc format
297 FilterReader reader = new FilterReader(img_conv);
298
299 // A buffer used to keep image data.
300 byte[] buf = new byte[out_data_sz];
301 long image_data_out = reader.read(buf);
302 // &image_data_out.front() contains RGB image data.
303
304 // Note that you don't need to read a whole image at a time. Alternatively
305 // you can read a chunk at a time by repeatedly calling reader.Read(buf)
306 // until the function returns 0.
307 }
308
309 static void ProcessElements(ElementReader reader) throws PDFNetException {
310 Element element;
311 while ((element = reader.next()) != null) // Read page contents
312 {
313 switch (element.getType()) {
314 case Element.e_path: // Process path data...
315 {
316 ProcessPath(reader, element);
317 }
318 break;
319 case Element.e_text_begin: // Process text block...
320 {
321 ProcessText(reader);
322 }
323 break;
324 case Element.e_form: // Process form XObjects
325 {
326 reader.formBegin();
327 ProcessElements(reader);
328 reader.end();
329 }
330 break;
331 case Element.e_image: // Process Images
332 {
333 ProcessImage(element);
334 }
335 break;
336 }
337 }
338 }
339
340 public static void main(String[] args) {
341 PDFNet.initialize(PDFTronLicense.Key());
342
343 // Relative path to the folder containing test files.
344 String input_path = "../../TestFiles/";
345 // string output_path = "../../TestFiles/Output/";
346
347 System.out.println("__________________________________________________");
348 System.out.println("Extract page element information from all ");
349 System.out.println("pages in the document.");
350 try (PDFDoc doc = new PDFDoc((input_path + "newsletter.pdf"))) // Extract text data from all pages in the document
351 {
352 doc.initSecurityHandler();
353
354 int pgnum = doc.getPageCount();
355 PageIterator page_begin = doc.getPageIterator();
356
357 ElementReader page_reader = new ElementReader();
358
359 PageIterator itr;
360
361 for (itr = page_begin; itr.hasNext(); ) // Read every page
362 {
363 Page nextPage = itr.next();
364 System.out.println("Page " + nextPage.getIndex() +
365 "----------------------------------------");
366 page_reader.begin(nextPage);
367 ProcessElements(page_reader);
368 page_reader.end();
369 }
370 System.out.println("Done");
371 } catch (Exception e) {
372 System.out.println(e);
373 }
374
375 PDFNet.terminate();
376 }
377}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales