ElementReaderAdv

Sample Java code for using the Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to perform color conversion and image normalization, and how to process changes in the graphics state. Learn more about our Android SDK and PDF Data Extraction SDK Capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package com.pdftron.android.pdfnetsdksamples.samples;
7
8import com.pdftron.android.pdfnetsdksamples.OutputListener;
9import com.pdftron.android.pdfnetsdksamples.PDFNetSample;
10import com.pdftron.android.pdfnetsdksamples.R;
11import com.pdftron.android.pdfnetsdksamples.util.Utils;
12import com.pdftron.common.Matrix2D;
13import com.pdftron.common.PDFNetException;
14import com.pdftron.filters.FilterReader;
15import com.pdftron.pdf.CharData;
16import com.pdftron.pdf.CharIterator;
17import com.pdftron.pdf.ColorPt;
18import com.pdftron.pdf.ColorSpace;
19import com.pdftron.pdf.Element;
20import com.pdftron.pdf.ElementReader;
21import com.pdftron.pdf.Font;
22import com.pdftron.pdf.GSChangesIterator;
23import com.pdftron.pdf.GState;
24import com.pdftron.pdf.Image2RGB;
25import com.pdftron.pdf.PDFDoc;
26import com.pdftron.pdf.Page;
27import com.pdftron.pdf.PageIterator;
28import com.pdftron.pdf.PathData;
29import com.pdftron.pdf.PatternColor;
30import com.pdftron.pdf.Shading;
31
32import java.util.ArrayList;
33
34public class ElementReaderAdvTest extends PDFNetSample {
35
36 private static OutputListener mOutputListener;
37
38 private static ArrayList<String> mFileList = new ArrayList<>();
39
40 public ElementReaderAdvTest() {
41 setTitle(R.string.sample_elementreaderadv_title);
42 setDescription(R.string.sample_elementreaderadv_description);
43 }
44
45 @Override
46 public void run(OutputListener outputListener) {
47 super.run(outputListener);
48 mOutputListener = outputListener;
49 mFileList.clear();
50 printHeader(outputListener);
51
52 // string output_path = "../../TestFiles/Output/";
53
54 mOutputListener.println("__________________________________________________");
55 mOutputListener.println("Extract page element information from all ");
56 mOutputListener.println("pages in the document.");
57 try (PDFDoc doc = new PDFDoc((Utils.getAssetTempFile(INPUT_PATH + "newsletter.pdf").getAbsolutePath()))) // Extract text data from all pages in the document
58 {
59 doc.initSecurityHandler();
60
61 int pgnum = doc.getPageCount();
62 PageIterator page_begin = doc.getPageIterator();
63
64 ElementReader page_reader = new ElementReader();
65
66 PageIterator itr;
67
68 for (itr = page_begin; itr.hasNext(); ) // Read every page
69 {
70 Page nextPage = itr.next();
71 mOutputListener.println("Page " + nextPage.getIndex() +
72 "----------------------------------------");
73 page_reader.begin(nextPage);
74 ProcessElements(page_reader);
75 page_reader.end();
76 }
77 mOutputListener.println("Done");
78 } catch (Exception e) {
79 mOutputListener.printError(e.getStackTrace());
80 }
81
82 for (String file : mFileList) {
83 addToFileList(file);
84 }
85 printFooter(outputListener);
86 }
87
88 static String m_buf;
89
90 static void ProcessPath(ElementReader reader, Element path) throws PDFNetException {
91 if (path.isClippingPath()) {
92 mOutputListener.println("This is a clipping path");
93 }
94
95 PathData pathData = path.getPathData();
96 double[] data = pathData.getPoints();
97 byte[] opr = pathData.getOperators();
98
99 double x1, y1, x2, y2, x3, y3;
100 // Use path.getCTM() if you are interested in CTM (current transformation matrix).
101
102 mOutputListener.print(" Path Data Points := \"");
103 int data_index = 0;
104 for (int opr_index = 0; opr_index < opr.length; ++opr_index) {
105 switch (opr[opr_index]) {
106 case PathData.e_moveto:
107 x1 = data[data_index];
108 ++data_index;
109 y1 = data[data_index];
110 ++data_index;
111 mOutputListener.print("M" + x1 + " " + y1);
112 break;
113 case PathData.e_lineto:
114 x1 = data[data_index];
115 ++data_index;
116 y1 = data[data_index];
117 ++data_index;
118 mOutputListener.print(" L" + x1 + " " + y1);
119
120 break;
121 case PathData.e_cubicto:
122 x1 = data[data_index];
123 ++data_index;
124 y1 = data[data_index];
125 ++data_index;
126 x2 = data[data_index];
127 ++data_index;
128 y2 = data[data_index];
129 ++data_index;
130 x3 = data[data_index];
131 ++data_index;
132 y3 = data[data_index];
133 ++data_index;
134 mOutputListener.print(" C" + x1 + " " + y1 + " " + x2 + " " + y2 + " " + x3 + " " + y3);
135 break;
136 case PathData.e_rect: {
137 x1 = data[data_index];
138 ++data_index;
139 y1 = data[data_index];
140 ++data_index;
141 double w = data[data_index];
142 ++data_index;
143 double h = data[data_index];
144 ++data_index;
145 x2 = x1 + w;
146 y2 = y1;
147 x3 = x2;
148 y3 = y1 + h;
149 double x4 = x1;
150 double y4 = y3;
151 mOutputListener.print("M" + x1 + " " + y1 + " L" + x2 + " " + y2 + " L" + x3 + " " + y3 + " L" + x4 + " " + y4 + " Z");
152 }
153 break;
154 case PathData.e_closepath:
155 mOutputListener.println(" Close Path");
156 break;
157 default:
158 throw new PDFNetException("Invalid Element Type", 0, "", "", "");
159 }
160 }
161
162 mOutputListener.print("\" ");
163
164 GState gs = path.getGState();
165
166 // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
167 if (path.isStroked()) {
168 mOutputListener.println("Stroke path");
169
170 if (gs.getStrokeColorSpace().getType() == ColorSpace.e_pattern) {
171 mOutputListener.println("Path has associated pattern");
172 } else {
173 // Get stroke color (you can use PDFNet color conversion facilities)
174 ColorPt rgb = new ColorPt();
175 rgb = gs.getStrokeColor();
176 double v = rgb.get(0);
177 rgb = gs.getStrokeColorSpace().convert2RGB(rgb);
178 v = rgb.get(0);
179 }
180 } else {
181 // Do not stroke path
182 }
183
184 if (path.isFilled()) {
185 mOutputListener.println("Fill path");
186
187 if (gs.getFillColorSpace().getType() == ColorSpace.e_pattern) {
188 mOutputListener.println("Path has associated pattern");
189 PatternColor pat = gs.getFillPattern();
190 int type = pat.getType();
191 if (type == PatternColor.e_shading) {
192 mOutputListener.println("Shading");
193 Shading shading = pat.getShading();
194 if (shading.getType() == Shading.e_function_shading) {
195 mOutputListener.println("FUNCT");
196 } else if (shading.getType() == Shading.e_axial_shading) {
197 mOutputListener.println("AXIAL");
198 } else if (shading.getType() == Shading.e_radial_shading) {
199 mOutputListener.println("RADIAL");
200 }
201 } else if (type == PatternColor.e_colored_tiling_pattern) {
202 mOutputListener.println("e_colored_tiling_pattern");
203 } else if (type == PatternColor.e_uncolored_tiling_pattern) {
204 mOutputListener.println("e_uncolored_tiling_pattern");
205 } else {
206 mOutputListener.println("?");
207 }
208 } else {
209 ColorPt rgb = new ColorPt();
210 rgb = gs.getFillColor();
211 double v = rgb.get(0);
212 rgb = gs.getFillColorSpace().convert2RGB(rgb);
213 v = rgb.get(0);
214 }
215 } else {
216 // Do not fill path
217 }
218
219 // Process any changes in graphics state ---------------------------------
220
221 GSChangesIterator gs_itr = reader.getChangesIterator();
222 while (gs_itr.hasNext()) {
223 switch (gs_itr.next().intValue()) {
224 case GState.e_transform:
225 // Get transform matrix for this element. Unlike path.GetCTM()
226 // that return full transformation matrix gs.GetTransform() return
227 // only the transformation matrix that was installed for this element.
228 //
229 //gs.getTransform();
230 break;
231 case GState.e_line_width:
232 //gs.getLineWidth();
233 break;
234 case GState.e_line_cap:
235 //gs.getLineCap();
236 break;
237 case GState.e_line_join:
238 //gs.getLineJoin();
239 break;
240 case GState.e_flatness:
241 break;
242 case GState.e_miter_limit:
243 //gs.getMiterLimit();
244 break;
245 case GState.e_dash_pattern: {
246 //double[] dashes;
247 //dashes=gs.getDashes();
248 //gs.getPhase();
249 }
250 break;
251 case GState.e_fill_color: {
252 if (gs.getFillColorSpace().getType() == ColorSpace.e_pattern &&
253 gs.getFillPattern().getType() != PatternColor.e_shading) {
254 //process the pattern data
255 reader.patternBegin(true);
256 ProcessElements(reader);
257 reader.end();
258 }
259 }
260 break;
261 }
262 }
263 reader.clearChangeList();
264 }
265
266 static void ProcessText(ElementReader page_reader) throws PDFNetException {
267 // Begin text element
268 mOutputListener.println("Begin Text Block:");
269
270 Element element;
271 while ((element = page_reader.next()) != null) {
272 switch (element.getType()) {
273 case Element.e_text_end:
274 // Finish the text block
275 mOutputListener.println("End Text Block.");
276 return;
277
278 case Element.e_text: {
279 GState gs = element.getGState();
280
281 ColorSpace cs_fill = gs.getFillColorSpace();
282 ColorPt fill = gs.getFillColor();
283
284 ColorPt out;
285 out = cs_fill.convert2RGB(fill);
286
287 ColorSpace cs_stroke = gs.getStrokeColorSpace();
288 ColorPt stroke = gs.getStrokeColor();
289
290 Font font = gs.getFont();
291
292 mOutputListener.println("Font Name: " + font.getName());
293 //font.isFixedWidth();
294 //font.isSerif();
295 //font.isSymbolic();
296 //font.isItalic();
297 // ...
298
299 //double font_size = gs.getFontSize();
300 //double word_spacing = gs.getWordSpacing();
301 //double char_spacing = gs.getCharSpacing();
302 //String txt = element.getTextString();
303
304 if (font.getType() == Font.e_Type3) {
305 //type 3 font, process its data
306 for (CharIterator itr = element.getCharIterator(); itr.hasNext(); ) {
307 page_reader.type3FontBegin(itr.next(), null);
308 ProcessElements(page_reader);
309 page_reader.end();
310 }
311 } else {
312 Matrix2D text_mtx = element.getTextMatrix();
313 double x, y;
314 long char_code;
315
316 for (CharIterator itr = element.getCharIterator(); itr.hasNext(); ) {
317 CharData data = itr.next();
318 char_code = data.getCharCode();
319 //mOutputListener.print("Character code: ");
320
321 mOutputListener.print(String.valueOf(char_code));
322
323 x = data.getGlyphX(); // character positioning information
324 y = data.getGlyphY();
325
326 // Use element.getCTM() if you are interested in the CTM
327 // (current transformation matrix).
328 Matrix2D ctm = element.getCTM();
329
330 // To get the exact character positioning information you need to
331 // concatenate current text matrix with CTM and then multiply
332 // relative positioning coordinates with the resulting matrix.
333 //
334 Matrix2D mtx = ctm.multiply(text_mtx);
335 com.pdftron.pdf.Point t = mtx.multPoint(x, y);
336 x = t.x;
337 y = t.y;
338 //mOutputListener.println(" Position: x=" + x + " y=" + y );
339 }
340
341 mOutputListener.println();
342 }
343 }
344 break;
345 }
346 }
347 }
348
349 static void ProcessImage(Element image) throws PDFNetException {
350 boolean image_mask = image.isImageMask();
351 boolean interpolate = image.isImageInterpolate();
352 int width = image.getImageWidth();
353 int height = image.getImageHeight();
354 int out_data_sz = width * height * 3;
355
356 mOutputListener.println("Image: " +
357 " width=\"" + width + "\""
358 + " height=\"" + height);
359
360 // Matrix2D& mtx = image->GetCTM(); // image matrix (page positioning info)
361
362 // You can use GetImageData to read the raw (decoded) image data
363 //image->GetBitsPerComponent();
364 //image->GetImageData(); // get raw image data
365 // .... or use Image2RGB filter that converts every image to RGB format,
366 // This should save you time since you don't need to deal with color conversions,
367 // image up-sampling, decoding etc.
368
369 Image2RGB img_conv = new Image2RGB(image); // Extract and convert image to RGB 8-bpc format
370 FilterReader reader = new FilterReader(img_conv);
371
372 // A buffer used to keep image data.
373 byte[] buf = new byte[out_data_sz];
374 long image_data_out = reader.read(buf);
375 // &image_data_out.front() contains RGB image data.
376
377 // Note that you don't need to read a whole image at a time. Alternatively
378 // you can read a chunk at a time by repeatedly calling reader.Read(buf)
379 // until the function returns 0.
380 }
381
382 static void ProcessElements(ElementReader reader) throws PDFNetException {
383 Element element;
384 while ((element = reader.next()) != null) // Read page contents
385 {
386 switch (element.getType()) {
387 case Element.e_path: // Process path data...
388 {
389 ProcessPath(reader, element);
390 }
391 break;
392 case Element.e_text_begin: // Process text block...
393 {
394 ProcessText(reader);
395 }
396 break;
397 case Element.e_form: // Process form XObjects
398 {
399 reader.formBegin();
400 ProcessElements(reader);
401 reader.end();
402 }
403 break;
404 case Element.e_image: // Process Images
405 {
406 ProcessImage(element);
407 }
408 break;
409 }
410 }
411 }
412
413}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales