ElementReaderAdv

Sample code for using Apryse UWP SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state.
Learn more about our full PDF Data Extraction SDK Capabilities.
To start your free trial, get started with the UWP SDK.
1//
2// Copyright (c) 2001-2020 by PDFTron Systems Inc. All Rights Reserved.
3//
4
5using System;
6using System.IO;
7using System.Threading.Tasks;
8using Windows.Foundation;
9
10using pdftron.Common;
11using pdftron.PDF;
12using pdftron.SDF;
13
14using PDFNetUniversalSamples.ViewModels;
15
16namespace PDFNetSamples
17{
18    public sealed class ElementReaderAdvTest : Sample
19    {
/// <summary>
/// Creates the sample, passing its display name and description text to the
/// base <c>Sample</c> constructor.
/// </summary>
public ElementReaderAdvTest()
    : base(
        "ElementReaderAdv",
        "The sample shows how to use some of more advanced PDFNet features. The sample code illustrates how to extract text, paths, and images. The sample also shows how to do color conversion, image normalization, and how to process changes in the graphics state.")
{
}
24
/// <summary>
/// Runs the sample on a background task: opens "newsletter.pdf" from
/// <c>InputPath</c>, prints the crop box of the first page and then walks that
/// page's content through <c>ProcessElements</c>.
/// </summary>
/// <returns>An <c>IAsyncAction</c> wrapping the background task.</returns>
/// <remarks>
/// Exceptions raised while processing are caught and reported through
/// <c>WriteLine</c>; they do not propagate to the caller.
/// </remarks>
public override IAsyncAction RunAsync()
{
    return Task.Run(new System.Action(() =>
    {
        WriteLine("--------------------------------");
        WriteLine("Starting ElementReaderAdv Test...");
        WriteLine("--------------------------------\n");
        try
        {
            // NOTE(review): this banner says "all pages", but only the first page is
            // read below. Confirm which of the two is intended before changing either.
            WriteLine("Extract page element information from all pages in the document.");
            string input_file_path = Path.Combine(InputPath, "newsletter.pdf");
            WriteLine("Opening input file " + input_file_path);
            PDFDoc doc = new PDFDoc(input_file_path);
            try
            {
                doc.InitSecurityHandler();

                ElementReader page_reader = new ElementReader();

                // Read the first page only.
                PageIterator itr = doc.GetPageIterator();
                WriteLine(String.Format("Page {0:d} ----------------------------------------", itr.GetPageNumber()));

                pdftron.PDF.Rect crop_box = itr.Current().GetCropBox();
                crop_box.Normalize();

                WriteLine(String.Format(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2));
                WriteLine(String.Format(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height()));

                page_reader.Begin(itr.Current());
                ProcessElements(page_reader);
                page_reader.End();
            }
            finally
            {
                // Release the document even when processing throws; previously
                // Destroy() was skipped on any exception raised after the open.
                doc.Destroy();
            }
            WriteLine("\nDone.");
        }
        catch (Exception e)
        {
            WriteLine(GetExceptionMessage(e));
        }

        WriteLine("\n--------------------------------");
        WriteLine("Done ElementReaderAdv Test.");
        WriteLine("--------------------------------\n");
    })).AsAsyncAction();
}
71
/// <summary>
/// Builds a textual description of a path element: whether it is a clipping
/// path, its segment data (move/line/cubic/rect/close), and whether it is
/// stroked and/or filled. It then walks the reader's graphics-state change
/// list; when the fill color changed to a non-shading pattern, the pattern's
/// content is processed through <c>ProcessElements</c>.
/// </summary>
/// <param name="reader">Reader the element came from; its change list is cleared before returning.</param>
/// <param name="path">The path element to describe.</param>
/// <returns>The description text for this path.</returns>
String ProcessPath(ElementReader reader, Element path)
{
    System.Text.StringBuilder result = new System.Text.StringBuilder();
    if (path.IsClippingPath())
    {
        result.Append("This is a clipping path.\n");
    }

    PathData pathData = path.GetPathData();
    double[] data = pathData.get_pts();   // flat list of coordinates
    byte[] opr = pathData.get_ops();      // one operator per path segment

    int data_itr = 0;                     // read cursor into data
    double x1, y1, x2, y2, x3, y3;

    // Use path.GetCTM() if you are interested in CTM (current transformation matrix).

    result.Append(" Path Data Points := \"\n");
    for (int opr_itr = 0; opr_itr < opr.Length; ++opr_itr)
    {
        switch ((PathDataPathSegmentType)((int)opr[opr_itr]))
        {
            case PathDataPathSegmentType.e_moveto:
                x1 = data[data_itr++];
                y1 = data[data_itr++];
                result.Append(String.Format("M{0:g5} {1:g5}", x1, y1));
                result.Append("\n");
                break;
            case PathDataPathSegmentType.e_lineto:
                x1 = data[data_itr++];
                y1 = data[data_itr++];
                result.Append(String.Format(" L{0:g5} {1:g5}", x1, y1));
                result.Append("\n");
                break;
            case PathDataPathSegmentType.e_cubicto:
                x1 = data[data_itr++];
                y1 = data[data_itr++];
                x2 = data[data_itr++];
                y2 = data[data_itr++];
                x3 = data[data_itr++];
                y3 = data[data_itr++];
                result.Append(String.Format(" C{0:g5} {1:g5} {2:g5} {3:g5} {4:g5} {5:g5}",
                    new object[] { x1, y1, x2, y2, x3, y3 }));
                result.Append("\n");
                break;
            case PathDataPathSegmentType.e_rect:
                {
                    // A rectangle is stored as (x, y, width, height); expand it
                    // into its four corners.
                    x1 = data[data_itr++];
                    y1 = data[data_itr++];
                    double w = data[data_itr++];
                    double h = data[data_itr++];
                    x2 = x1 + w;
                    y2 = y1;
                    x3 = x2;
                    y3 = y1 + h;
                    double x4 = x1;
                    double y4 = y3;
                    // The fourth corner is (x4, y4); the previous code printed
                    // x3 in place of y4.
                    result.Append(String.Format("M{0:g5} {1:g5} L{2:g5} {3:g5} L{4:g5} {5:g5} L{6:g5} {7:g5} Z",
                        new object[] { x1, y1, x2, y2, x3, y3, x4, y4 }));
                    break;
                }
            case PathDataPathSegmentType.e_closepath:
                result.Append("\n Close Path\n");
                break;
            default:
                System.Diagnostics.Debug.Assert(false);
                break;
        }
    }

    result.Append("\" ");

    GState gs = path.GetGState();

    // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    if (path.IsStroked())
    {
        result.Append("Stroke path\n");

        if (gs.GetStrokeColorSpace().GetType() == ColorSpaceType.e_pattern)
        {
            result.Append("Path has associated pattern\n");
        }
        else
        {
            // Get stroke color (you can use PDFNet color conversion facilities)
            // ColorPt rgb = new ColorPt();
            // gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
        }
    }

    if (path.IsFilled())
    {
        result.Append("Fill path\n");

        if (gs.GetFillColorSpace().GetType() == ColorSpaceType.e_pattern)
        {
            result.Append("Path has associated pattern\n");
        }
        else
        {
            // ColorPt rgb = new ColorPt();
            // gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
        }
    }

    // Process any changes in graphics state  ---------------------------------

    for (GSChangesIterator gs_itr = reader.GetChangesIterator(); gs_itr.HasNext(); gs_itr.Next())
    {
        switch (gs_itr.Current())
        {
            case GStateGStateAttribute.e_transform:
                // Get transform matrix for this element. Unlike path.GetCTM()
                // that returns the full transformation matrix, gs.GetTransform()
                // returns only the transformation matrix that was installed for
                // this element.
                //
                // gs.GetTransform();
                break;
            case GStateGStateAttribute.e_line_width:
                // gs.GetLineWidth();
                break;
            case GStateGStateAttribute.e_line_cap:
                // gs.GetLineCap();
                break;
            case GStateGStateAttribute.e_line_join:
                // gs.GetLineJoin();
                break;
            case GStateGStateAttribute.e_flatness:
                break;
            case GStateGStateAttribute.e_miter_limit:
                // gs.GetMiterLimit();
                break;
            case GStateGStateAttribute.e_dash_pattern:
                {
                    // double[] dashes;
                    // gs.GetDashes(dashes);
                    // gs.GetPhase()
                    break;
                }
            case GStateGStateAttribute.e_fill_color:
                {
                    if (gs.GetFillColorSpace().GetType() == ColorSpaceType.e_pattern &&
                         gs.GetFillPattern().GetType() != PatternColorType.e_shading)
                    {
                        // Process the pattern data. ProcessElements prints what it
                        // finds itself, so nothing is appended to result here.
                        reader.PatternBegin(true);
                        ProcessElements(reader);
                        reader.End();
                    }
                    break;
                }
        }
    }
    reader.ClearChangeList();
    return result.ToString();
}
247
/// <summary>
/// Consumes elements from <paramref name="page_reader"/> until the end of the
/// current text block and builds a description of each text run: font name,
/// effective font size, fill color converted to RGB, and every character with
/// its transformed position. Runs drawn with a Type3 font are instead expanded
/// glyph by glyph through <c>ProcessElements</c>.
/// </summary>
/// <param name="page_reader">Reader positioned just after a text-begin element.</param>
/// <returns>
/// The description text. It ends with "End Text Block." when a text-end element
/// was reached; if the reader runs out of elements first, whatever was
/// collected so far is returned.
/// </returns>
String ProcessText(ElementReader page_reader)
{
    System.Text.StringBuilder result = new System.Text.StringBuilder();
    // Begin text element
    result.Append("Begin Text Block:\n");

    Element element;
    while ((element = page_reader.Next()) != null)
    {
        switch (element.GetType())
        {
            case ElementType.e_text_end:
                // Finish the text block
                result.Append("End Text Block.\n");
                return result.ToString();

            case ElementType.e_text:
                {
                    GState gs = element.GetGState();
                    Font font = gs.GetFont();

                    result.Append("Font Name: ");
                    result.Append(font.GetName() + "\n");
                    // font.IsFixedWidth();
                    // font.IsSerif();
                    // font.IsSymbolic();
                    // font.IsItalic();
                    // ...

                    // double word_spacing = gs.GetWordSpacing();
                    // double char_spacing = gs.GetCharSpacing();
                    // gs.GetStrokeColorSpace() / gs.GetStrokeColor() give the stroke color.

                    // Use element.GetCTM() if you are interested in the CTM
                    // (current transformation matrix).
                    if (font.GetType() == FontType.e_Type3)
                    {
                        // Type 3 font: each glyph is its own content stream, so process its data.
                        for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                        {
                            page_reader.Type3FontBegin(itr.Current());
                            ProcessElements(page_reader);
                            page_reader.End();
                        }
                    }
                    else
                    {
                        Matrix2D ctm = element.GetCTM();
                        Matrix2D text_mtx = element.GetTextMatrix();

                        // To get exact character positions, concatenate the current
                        // text matrix with the CTM and multiply the relative
                        // coordinates by the result. The product is the same for
                        // every character of the run, so it is computed once.
                        Matrix2D mtx = Matrix2D.Mult(ctm, text_mtx);

                        double font_sz_scale_factor = System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d);
                        double font_size = gs.GetFontSize();
                        result.Append(String.Format(" Font Size: {0:f}\n", font_sz_scale_factor * font_size));

                        ColorPt rgb = new ColorPt();
                        gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);

                        result.Append(String.Format("Font Color(RGB): red={0:d} green={1:d} blue={2:d}\n",
                            (byte)(rgb.Get(0) * 255),
                            (byte)(rgb.Get(1) * 255),
                            (byte)(rgb.Get(2) * 255)));

                        for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                        {
                            // Fetch the current character once instead of once per field.
                            var ch = itr.Current();
                            int char_code = ch.char_code;

                            result.Append("Character code: ");
                            result.Append(String.Format("{0}\n", (char)char_code));

                            // Character positioning information, relative to the text run.
                            pdftron.Common.DoubleRef x = new pdftron.Common.DoubleRef(ch.x);
                            pdftron.Common.DoubleRef y = new pdftron.Common.DoubleRef(ch.y);
                            mtx.Mult(x, y);

                            result.Append(String.Format(" Position: x={0:f} y={1:f}\n", x.Value, y.Value));
                        }
                    }
                    break;
                }
        }
    }
    return result.ToString();
}
356
// Number of image elements handed to ProcessImage so far.
int image_counter = 0;

/// <summary>
/// Describes an image element by its pixel dimensions and counts it in
/// <c>image_counter</c>.
/// </summary>
/// <param name="image">The image element to describe.</param>
/// <returns>A line of text of the form <c>Image: width="W" height="H"</c>, surrounded by newlines.</returns>
String ProcessImage(Element image)
{
    int width = image.GetImageWidth();
    int height = image.GetImageHeight();

    ++image_counter;

    // Other properties available on the element:
    //   image.IsImageMask();
    //   image.IsImageInterpolate();
    //   Matrix2D mtx = image.GetCTM(); // image matrix (page positioning info)

    /*
     System.Drawing.Bitmap bmp = image.GetBitmap();
     bmp.Save(output_path + "reader_img_extract_" + image_counter.ToString() + ".png", System.Drawing.Imaging.ImageFormat.Png);
     */

    // Alternatively you can use GetImageData to read the raw (decoded) image data
    // image.GetBitsPerComponent();
    // image.GetImageData();	// get raw image data
    // another approach is to use Image2RGB filter that converts every image to
    // RGB format. This could save you time since you don't need to deal with color
    // conversions, image up-sampling, decoding etc.
    // ----------------
    //   Image2RGB img_conv = new Image2RGB(image);	// Extract and convert image to RGB 8-bpc format
    //   FilterReader reader = new FilterReader(img_conv);
    //   // A buffer used to keep image data: 3 bytes per pixel. Compute the size in
    //   // a wider type (or a checked context) so large images cannot overflow int.
    //   byte[] image_data_out = new byte[checked(width * height * 3)];
    //   reader.Read(image_data_out);  // image_data_out contains RGB image data.
    // ----------------
    // Note that you don't need to read a whole image at a time. Alternatively
    // you can read a chunk at a time by repeatedly calling reader.Read(buf, buf_sz)
    // until the function returns 0.
    return String.Format("\nImage: width=\"{0:d}\" height=\"{1:d}\"\n", width, height);
}
395
/// <summary>
/// Reads every remaining element from <paramref name="reader"/>, dispatches
/// paths, text blocks, form XObjects and images to their handlers, and writes
/// the collected description through <c>Write</c>.
/// </summary>
/// <param name="reader">Reader to drain; form XObjects are entered and left on this same reader.</param>
/// <returns>
/// Always the empty string. The collected text is emitted through <c>Write</c>
/// rather than returned, so a nested call prints its own output before the
/// enclosing call prints.
/// </returns>
String ProcessElements(ElementReader reader)
{
    String collected = "";

    // Read page contents until the reader reports no more elements.
    for (Element current = reader.Next(); current != null; current = reader.Next())
    {
        var kind = current.GetType();

        if (kind == ElementType.e_path)
        {
            // Process path data...
            collected += ProcessPath(reader, current) + "\n";
        }
        else if (kind == ElementType.e_text_begin)
        {
            // Process text strings...
            collected += ProcessText(reader) + "\n";
        }
        else if (kind == ElementType.e_form)
        {
            // Process form XObjects
            reader.FormBegin();
            String nested = ProcessElements(reader);
            collected += nested + "\n";
            reader.End();
        }
        else if (kind == ElementType.e_image)
        {
            // Process Images
            collected += ProcessImage(current) + "\n";
        }
    }

    // Print result msg
    Write(collected);

    return "";
}
433    }
434}
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales
Product:

ElementReaderAdv