ElementReaderAdv

Sample Java and Kotlin code for using the Apryse Android SDK to extract text, paths, and images from a PDF. The sample also shows how to perform color conversion and image normalization, and how to process changes in the graphics state.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Android SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2019 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6package com.pdftron.android.pdfnetsdksamples.samples;
7
8import com.pdftron.android.pdfnetsdksamples.OutputListener;
9import com.pdftron.android.pdfnetsdksamples.PDFNetSample;
10import com.pdftron.android.pdfnetsdksamples.R;
11import com.pdftron.android.pdfnetsdksamples.util.Utils;
12import com.pdftron.common.Matrix2D;
13import com.pdftron.common.PDFNetException;
14import com.pdftron.filters.FilterReader;
15import com.pdftron.pdf.CharData;
16import com.pdftron.pdf.CharIterator;
17import com.pdftron.pdf.ColorPt;
18import com.pdftron.pdf.ColorSpace;
19import com.pdftron.pdf.Element;
20import com.pdftron.pdf.ElementReader;
21import com.pdftron.pdf.Font;
22import com.pdftron.pdf.GSChangesIterator;
23import com.pdftron.pdf.GState;
24import com.pdftron.pdf.Image2RGB;
25import com.pdftron.pdf.PDFDoc;
26import com.pdftron.pdf.Page;
27import com.pdftron.pdf.PageIterator;
28import com.pdftron.pdf.PathData;
29import com.pdftron.pdf.PatternColor;
30import com.pdftron.pdf.Shading;
31
32import java.util.ArrayList;
33
34public class ElementReaderAdvTest extends PDFNetSample {
35
36 private static OutputListener mOutputListener;
37
38 private static ArrayList<String> mFileList = new ArrayList<>();
39
40 public ElementReaderAdvTest() {
41 setTitle(R.string.sample_elementreaderadv_title);
42 setDescription(R.string.sample_elementreaderadv_description);
43 }
44
45 @Override
46 public void run(OutputListener outputListener) {
47 super.run(outputListener);
48 mOutputListener = outputListener;
49 mFileList.clear();
50 printHeader(outputListener);
51
52 // string output_path = "../../TestFiles/Output/";
53
54 mOutputListener.println("__________________________________________________");
55 mOutputListener.println("Extract page element information from all ");
56 mOutputListener.println("pages in the document.");
57 try (PDFDoc doc = new PDFDoc((Utils.getAssetTempFile(INPUT_PATH + "newsletter.pdf").getAbsolutePath()))) // Extract text data from all pages in the document
58 {
59 doc.initSecurityHandler();
60
61 int pgnum = doc.getPageCount();
62 PageIterator page_begin = doc.getPageIterator();
63
64 ElementReader page_reader = new ElementReader();
65
66 PageIterator itr;
67
68 for (itr = page_begin; itr.hasNext(); ) // Read every page
69 {
70 Page nextPage = itr.next();
71 mOutputListener.println("Page " + nextPage.getIndex() +
72 "----------------------------------------");
73 page_reader.begin(nextPage);
74 ProcessElements(page_reader);
75 page_reader.end();
76 }
77 mOutputListener.println("Done");
78 } catch (Exception e) {
79 mOutputListener.printError(e.getStackTrace());
80 }
81
82 for (String file : mFileList) {
83 addToFileList(file);
84 }
85 printFooter(outputListener);
86 }
87
88 static String m_buf;
89
90 static void ProcessPath(ElementReader reader, Element path) throws PDFNetException {
91 if (path.isClippingPath()) {
92 mOutputListener.println("This is a clipping path");
93 }
94
95 PathData pathData = path.getPathData();
96 double[] data = pathData.getPoints();
97 byte[] opr = pathData.getOperators();
98
99 double x1, y1, x2, y2, x3, y3;
100 // Use path.getCTM() if you are interested in CTM (current transformation matrix).
101
102 mOutputListener.print(" Path Data Points := \"");
103 int data_index = 0;
104 for (int opr_index = 0; opr_index < opr.length; ++opr_index) {
105 switch (opr[opr_index]) {
106 case PathData.e_moveto:
107 x1 = data[data_index];
108 ++data_index;
109 y1 = data[data_index];
110 ++data_index;
111 mOutputListener.print("M" + x1 + " " + y1);
112 break;
113 case PathData.e_lineto:
114 x1 = data[data_index];
115 ++data_index;
116 y1 = data[data_index];
117 ++data_index;
118 mOutputListener.print(" L" + x1 + " " + y1);
119
120 break;
121 case PathData.e_cubicto:
122 x1 = data[data_index];
123 ++data_index;
124 y1 = data[data_index];
125 ++data_index;
126 x2 = data[data_index];
127 ++data_index;
128 y2 = data[data_index];
129 ++data_index;
130 x3 = data[data_index];
131 ++data_index;
132 y3 = data[data_index];
133 ++data_index;
134 mOutputListener.print(" C" + x1 + " " + y1 + " " + x2 + " " + y2 + " " + x3 + " " + y3);
135 break;
136 case PathData.e_rect: {
137 x1 = data[data_index];
138 ++data_index;
139 y1 = data[data_index];
140 ++data_index;
141 double w = data[data_index];
142 ++data_index;
143 double h = data[data_index];
144 ++data_index;
145 x2 = x1 + w;
146 y2 = y1;
147 x3 = x2;
148 y3 = y1 + h;
149 double x4 = x1;
150 double y4 = y3;
151 mOutputListener.print("M" + x1 + " " + y1 + " L" + x2 + " " + y2 + " L" + x3 + " " + y3 + " L" + x4 + " " + y4 + " Z");
152 }
153 break;
154 case PathData.e_closepath:
155 mOutputListener.println(" Close Path");
156 break;
157 default:
158 throw new PDFNetException("Invalid Element Type", 0, "", "", "");
159 }
160 }
161
162 mOutputListener.print("\" ");
163
164 GState gs = path.getGState();
165
166 // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
167 if (path.isStroked()) {
168 mOutputListener.println("Stroke path");
169
170 if (gs.getStrokeColorSpace().getType() == ColorSpace.e_pattern) {
171 mOutputListener.println("Path has associated pattern");
172 } else {
173 // Get stroke color (you can use PDFNet color conversion facilities)
174 ColorPt rgb = new ColorPt();
175 rgb = gs.getStrokeColor();
176 double v = rgb.get(0);
177 rgb = gs.getStrokeColorSpace().convert2RGB(rgb);
178 v = rgb.get(0);
179 }
180 } else {
181 // Do not stroke path
182 }
183
184 if (path.isFilled()) {
185 mOutputListener.println("Fill path");
186
187 if (gs.getFillColorSpace().getType() == ColorSpace.e_pattern) {
188 mOutputListener.println("Path has associated pattern");
189 PatternColor pat = gs.getFillPattern();
190 int type = pat.getType();
191 if (type == PatternColor.e_shading) {
192 mOutputListener.println("Shading");
193 Shading shading = pat.getShading();
194 if (shading.getType() == Shading.e_function_shading) {
195 mOutputListener.println("FUNCT");
196 } else if (shading.getType() == Shading.e_axial_shading) {
197 mOutputListener.println("AXIAL");
198 } else if (shading.getType() == Shading.e_radial_shading) {
199 mOutputListener.println("RADIAL");
200 }
201 } else if (type == PatternColor.e_colored_tiling_pattern) {
202 mOutputListener.println("e_colored_tiling_pattern");
203 } else if (type == PatternColor.e_uncolored_tiling_pattern) {
204 mOutputListener.println("e_uncolored_tiling_pattern");
205 } else {
206 mOutputListener.println("?");
207 }
208 } else {
209 ColorPt rgb = new ColorPt();
210 rgb = gs.getFillColor();
211 double v = rgb.get(0);
212 rgb = gs.getFillColorSpace().convert2RGB(rgb);
213 v = rgb.get(0);
214 }
215 } else {
216 // Do not fill path
217 }
218
219 // Process any changes in graphics state ---------------------------------
220
221 GSChangesIterator gs_itr = reader.getChangesIterator();
222 while (gs_itr.hasNext()) {
223 switch (gs_itr.next().intValue()) {
224 case GState.e_transform:
225 // Get transform matrix for this element. Unlike path.GetCTM()
226 // that return full transformation matrix gs.GetTransform() return
227 // only the transformation matrix that was installed for this element.
228 //
229 //gs.getTransform();
230 break;
231 case GState.e_line_width:
232 //gs.getLineWidth();
233 break;
234 case GState.e_line_cap:
235 //gs.getLineCap();
236 break;
237 case GState.e_line_join:
238 //gs.getLineJoin();
239 break;
240 case GState.e_flatness:
241 break;
242 case GState.e_miter_limit:
243 //gs.getMiterLimit();
244 break;
245 case GState.e_dash_pattern: {
246 //double[] dashes;
247 //dashes=gs.getDashes();
248 //gs.getPhase();
249 }
250 break;
251 case GState.e_fill_color: {
252 if (gs.getFillColorSpace().getType() == ColorSpace.e_pattern &&
253 gs.getFillPattern().getType() != PatternColor.e_shading) {
254 //process the pattern data
255 reader.patternBegin(true);
256 ProcessElements(reader);
257 reader.end();
258 }
259 }
260 break;
261 }
262 }
263 reader.clearChangeList();
264 }
265
266 static void ProcessText(ElementReader page_reader) throws PDFNetException {
267 // Begin text element
268 mOutputListener.println("Begin Text Block:");
269
270 Element element;
271 while ((element = page_reader.next()) != null) {
272 switch (element.getType()) {
273 case Element.e_text_end:
274 // Finish the text block
275 mOutputListener.println("End Text Block.");
276 return;
277
278 case Element.e_text: {
279 GState gs = element.getGState();
280
281 ColorSpace cs_fill = gs.getFillColorSpace();
282 ColorPt fill = gs.getFillColor();
283
284 ColorPt out;
285 out = cs_fill.convert2RGB(fill);
286
287 ColorSpace cs_stroke = gs.getStrokeColorSpace();
288 ColorPt stroke = gs.getStrokeColor();
289
290 Font font = gs.getFont();
291
292 mOutputListener.println("Font Name: " + font.getName());
293 //font.isFixedWidth();
294 //font.isSerif();
295 //font.isSymbolic();
296 //font.isItalic();
297 // ...
298
299 //double font_size = gs.getFontSize();
300 //double word_spacing = gs.getWordSpacing();
301 //double char_spacing = gs.getCharSpacing();
302 //String txt = element.getTextString();
303
304 if (font.getType() == Font.e_Type3) {
305 //type 3 font, process its data
306 for (CharIterator itr = element.getCharIterator(); itr.hasNext(); ) {
307 page_reader.type3FontBegin(itr.next(), null);
308 ProcessElements(page_reader);
309 page_reader.end();
310 }
311 } else {
312 Matrix2D text_mtx = element.getTextMatrix();
313 double x, y;
314 long char_code;
315
316 for (CharIterator itr = element.getCharIterator(); itr.hasNext(); ) {
317 CharData data = itr.next();
318 char_code = data.getCharCode();
319 //mOutputListener.print("Character code: ");
320
321 mOutputListener.print(String.valueOf(char_code));
322
323 x = data.getGlyphX(); // character positioning information
324 y = data.getGlyphY();
325
326 // Use element.getCTM() if you are interested in the CTM
327 // (current transformation matrix).
328 Matrix2D ctm = element.getCTM();
329
330 // To get the exact character positioning information you need to
331 // concatenate current text matrix with CTM and then multiply
332 // relative positioning coordinates with the resulting matrix.
333 //
334 Matrix2D mtx = ctm.multiply(text_mtx);
335 com.pdftron.pdf.Point t = mtx.multPoint(x, y);
336 x = t.x;
337 y = t.y;
338 //mOutputListener.println(" Position: x=" + x + " y=" + y );
339 }
340
341 mOutputListener.println();
342 }
343 }
344 break;
345 }
346 }
347 }
348
349 static void ProcessImage(Element image) throws PDFNetException {
350 boolean image_mask = image.isImageMask();
351 boolean interpolate = image.isImageInterpolate();
352 int width = image.getImageWidth();
353 int height = image.getImageHeight();
354 int out_data_sz = width * height * 3;
355
356 mOutputListener.println("Image: " +
357 " width=\"" + width + "\""
358 + " height=\"" + height);
359
360 // Matrix2D& mtx = image->GetCTM(); // image matrix (page positioning info)
361
362 // You can use GetImageData to read the raw (decoded) image data
363 //image->GetBitsPerComponent();
364 //image->GetImageData(); // get raw image data
365 // .... or use Image2RGB filter that converts every image to RGB format,
366 // This should save you time since you don't need to deal with color conversions,
367 // image up-sampling, decoding etc.
368
369 Image2RGB img_conv = new Image2RGB(image); // Extract and convert image to RGB 8-bpc format
370 FilterReader reader = new FilterReader(img_conv);
371
372 // A buffer used to keep image data.
373 byte[] buf = new byte[out_data_sz];
374 long image_data_out = reader.read(buf);
375 // &image_data_out.front() contains RGB image data.
376
377 // Note that you don't need to read a whole image at a time. Alternatively
378 // you can read a chunk at a time by repeatedly calling reader.Read(buf)
379 // until the function returns 0.
380 }
381
382 static void ProcessElements(ElementReader reader) throws PDFNetException {
383 Element element;
384 while ((element = reader.next()) != null) // Read page contents
385 {
386 switch (element.getType()) {
387 case Element.e_path: // Process path data...
388 {
389 ProcessPath(reader, element);
390 }
391 break;
392 case Element.e_text_begin: // Process text block...
393 {
394 ProcessText(reader);
395 }
396 break;
397 case Element.e_form: // Process form XObjects
398 {
399 reader.formBegin();
400 ProcessElements(reader);
401 reader.end();
402 }
403 break;
404 case Element.e_image: // Process Images
405 {
406 ProcessImage(element);
407 }
408 break;
409 }
410 }
411 }
412
413}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales