ElementReaderAdv

Sample C# code for using Apryse Xamarin SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with the Xamarin SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5// A sample project illustrating some extraction capabilities of ElementReader
6// in more detail
7//---------------------------------------------------------------------------------------
8
9using System;
10using pdftron;
11using pdftron.Common;
12using pdftron.Filters;
13using pdftron.SDF;
14using pdftron.PDF;
15
16using NUnit.Framework;
17
18namespace MiscellaneousSamples
19{
20 /// <summary>
21 /// NUnit fixture showing how to use ElementReader to read path, text and image elements from a PDF.
22 /// </summary>
23 [TestFixture]
24 public class ElementReaderAdvTest
25 {
26
27 // Relative path to the folder containing test files.
28 const string input_path = "TestFiles/";
29
30 static string m_buf;
31
/// <summary>
/// Prints a path element's segment data to the console, reports whether the path is a
/// clipping, stroked and/or filled path, and then walks the reader's list of
/// graphics-state changes, descending into fill patterns that are not shadings.
/// </summary>
/// <param name="reader">
/// Reader that produced <paramref name="path"/>; it supplies the graphics-state change
/// list and is used to read pattern content. Its change list is cleared before returning.
/// </param>
/// <param name="path">The path element to describe.</param>
public static void ProcessPath(ElementReader reader, Element path)
{
    if (path.IsClippingPath())
    {
        Console.WriteLine("This is a clipping path");
    }

    PathData segments = path.GetPathData();
    double[] coords = segments.points;
    byte[] ops = segments.operators;
    int opCount = ops.Length;

    // Index of the next unread value in coords; each operator consumes a fixed
    // number of values (2 for move/line, 6 for cubic, 4 for rect, 0 for close).
    int next = 0;

    // Use path.GetCTM() if you are interested in CTM (current transformation matrix).

    Console.Write(" Path Data Points := \"");
    for (int i = 0; i < opCount; i++)
    {
        PathData.PathSegmentType kind = (PathData.PathSegmentType)((int)ops[i]);

        if (kind == PathData.PathSegmentType.e_moveto)
        {
            double x = coords[next++];
            double y = coords[next++];
            m_buf = string.Format("M{0:n0} {1:n0}", x, y);
            Console.Write(m_buf);
        }
        else if (kind == PathData.PathSegmentType.e_lineto)
        {
            double x = coords[next++];
            double y = coords[next++];
            m_buf = string.Format(" L{0:n0} {1:n0}", x, y);
            Console.Write(m_buf);
        }
        else if (kind == PathData.PathSegmentType.e_cubicto)
        {
            double cx1 = coords[next++];
            double cy1 = coords[next++];
            double cx2 = coords[next++];
            double cy2 = coords[next++];
            double ex = coords[next++];
            double ey = coords[next++];
            m_buf = string.Format(" C{0:n0} {1:n0} {2:n0} {3:n0} {4:n0} {5:n0}",
                new object[] { cx1, cy1, cx2, cy2, ex, ey });
            Console.Write(m_buf);
        }
        else if (kind == PathData.PathSegmentType.e_rect)
        {
            // A rectangle is stored as origin + size; expand it to its four corners.
            double left = coords[next++];
            double bottom = coords[next++];
            double width = coords[next++];
            double height = coords[next++];
            double right = left + width;
            double top = bottom + height;
            m_buf = string.Format("M{0:n0} {1:n0} L{2:n0} {3:n0} L{4:n0} {5:n0} L{6:n0} {7:n0} Z",
                new object[] { left, bottom, right, bottom, right, top, left, top });
            Console.Write(m_buf);
        }
        else if (kind == PathData.PathSegmentType.e_closepath)
        {
            Console.WriteLine(" Close Path");
        }
        else
        {
            // Segment type this sample does not handle.
            System.Diagnostics.Debug.Assert(false);
        }
    }

    Console.Write("\" ");

    GState gs = path.GetGState();

    // Path state (stroke, fill) -------------------------------------------------
    if (path.IsStroked())
    {
        Console.WriteLine("Stroke path");

        bool strokeUsesPattern = gs.GetStrokeColorSpace().GetType() == ColorSpace.Type.e_pattern;
        if (strokeUsesPattern)
        {
            Console.WriteLine("Path has associated pattern");
        }
        // Otherwise the stroke color can be converted with the PDFNet color facilities:
        //   ColorPt rgb = new ColorPt();
        //   gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
    }

    if (path.IsFilled())
    {
        Console.WriteLine("Fill path");

        bool fillUsesPattern = gs.GetFillColorSpace().GetType() == ColorSpace.Type.e_pattern;
        if (fillUsesPattern)
        {
            Console.WriteLine("Path has associated pattern");
        }
        // Otherwise:
        //   ColorPt rgb = new ColorPt();
        //   gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
    }

    // Changes in graphics state ---------------------------------------------------
    // Only a fill-color change triggers work here. The other attributes reported by
    // the iterator (e_transform, e_line_width, e_line_cap, e_line_join, e_flatness,
    // e_miter_limit, e_dash_pattern) are ignored; their current values can be read
    // from gs (gs.GetTransform(), gs.GetLineWidth(), gs.GetLineCap(), ...).
    // Unlike path.GetCTM(), which returns the full transformation matrix,
    // gs.GetTransform() returns only the matrix that was installed for this element.
    GSChangesIterator changes = reader.GetChangesIterator();
    while (changes.HasNext())
    {
        if (changes.Current() == GState.GStateAttribute.e_fill_color)
        {
            if (gs.GetFillColorSpace().GetType() == ColorSpace.Type.e_pattern &&
                gs.GetFillPattern().GetType() != PatternColor.Type.e_shading)
            {
                // Process the pattern data with the same reader.
                reader.PatternBegin(true);
                ProcessElements(reader);
                reader.End();
            }
        }

        changes.Next();
    }

    reader.ClearChangeList();
}
201
/// <summary>
/// Reads elements from <paramref name="page_reader"/> until the end of the current text
/// block, printing the font name of every text run and, for fonts other than Type3,
/// the characters whose codes fall in the range 32..127.
/// </summary>
/// <remarks>
/// Returns when an <c>e_text_end</c> element is read or when the reader has no more
/// elements. For Type3 fonts each glyph's content is processed with
/// <c>ProcessElements</c> on the same reader.
/// </remarks>
/// <param name="page_reader">Reader positioned inside a text block.</param>
public static void ProcessText(ElementReader page_reader)
{
    // Begin text element
    Console.WriteLine("Begin Text Block:");

    Element element;
    while ((element = page_reader.Next()) != null)
    {
        switch (element.GetType())
        {
            case Element.Type.e_text_end:
                // Finish the text block
                Console.WriteLine("End Text Block.");
                return;

            case Element.Type.e_text:
            {
                GState gs = element.GetGState();

                // Color conversion: convert the fill color to RGB.
                ColorSpace cs_fill = gs.GetFillColorSpace();
                ColorPt fill = gs.GetFillColor();

                ColorPt outc = new ColorPt();
                cs_fill.Convert2RGB(fill, outc);

                ColorSpace cs_stroke = gs.GetStrokeColorSpace();
                ColorPt stroke = gs.GetStrokeColor();

                Font font = gs.GetFont();

                Console.Write("Font Name: ");
                Console.WriteLine(font.GetName());
                // Other font queries: font.IsFixedWidth(), font.IsSerif(),
                // font.IsSymbolic(), font.IsItalic(), ...
                // Spacing: gs.GetWordSpacing(), gs.GetCharSpacing().

                // Use element.GetCTM() if you are interested in the CTM
                // (current transformation matrix).
                if (font.GetType() == Font.Type.e_Type3)
                {
                    // Type 3 font: process the content of each glyph.
                    for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                    {
                        page_reader.Type3FontBegin(itr.Current());
                        ProcessElements(page_reader);
                        page_reader.End();
                    }
                }
                else
                {
                    Matrix2D ctm = element.GetCTM();
                    Matrix2D text_mtx = element.GetTextMatrix();

                    // To get the exact character position, concatenate the text matrix
                    // with the CTM and multiply the relative coordinates by the result.
                    // The product is the same for every character in this run, so it
                    // is computed once, outside the loop.
                    //
                    // The same matrix gives the effective font size:
                    //   gs.GetFontSize() * Math.Sqrt(mtx2.m_b * mtx2.m_b + mtx2.m_d * mtx2.m_d)
                    Matrix2D mtx2 = ctm * text_mtx;

                    for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                    {
                        Console.Write("Character code: ");
                        int char_code = itr.Current().char_code;

                        // Print only if in ASCII range. Both bounds must hold; with
                        // '||' the test was always true and every code was printed.
                        if (char_code >= 32 && char_code <= 127)
                        {
                            Console.Write((char)char_code);
                        }

                        double x = itr.Current().x; // character positioning information
                        double y = itr.Current().y;

                        mtx2.Mult(ref x, ref y);
                        // Console.WriteLine(" Position: x={0:f} y={1:f}", x, y);
                    }
                }

                Console.WriteLine();
                break;
            }
        }
    }
}
317
318 static int image_counter = 0;
319
/// <summary>
/// Prints the pixel dimensions of an image element and reads its data through an
/// <c>Image2RGB</c> filter into a buffer of width * height * 3 bytes.
/// </summary>
/// <param name="image">The image element to process.</param>
public static void ProcessImage(Element image)
{
    bool isMask = image.IsImageMask();
    bool isInterpolated = image.IsImageInterpolate();
    int pixelsWide = image.GetImageWidth();
    int pixelsHigh = image.GetImageHeight();

    // Three bytes (R, G, B) per pixel.
    int rgbByteCount = pixelsWide * pixelsHigh * 3;

    Console.WriteLine("Image: width=\"{0:d}\" height=\"{1:d}\"", pixelsWide, pixelsHigh);

    // image.GetCTM() gives the image matrix (page positioning info).
    //
    // Other ways to get at the pixels:
    //   - image.GetBitmap() returns a bitmap that can be saved to a file;
    //   - image.GetBitsPerComponent() / image.GetImageData() expose the raw
    //     (decoded) image data.
    //
    // The Image2RGB filter used below converts every image to RGB 8-bpc format,
    // which avoids dealing with color conversions, image up-sampling, decoding etc.
    FilterReader rgbReader = new FilterReader(new Image2RGB(image));
    byte[] rgbData = new byte[rgbByteCount];
    rgbReader.Read(rgbData); // rgbData now holds the RGB image data.

    // Note that you don't need to read a whole image at a time. Alternatively
    // you can read a chunk at a time by repeatedly calling reader.Read(buf, buf_sz)
    // until the function returns 0.
}
352
/// <summary>
/// Reads every remaining element from <paramref name="reader"/> and dispatches it by
/// type: paths, text blocks and images go to their handlers, and form XObjects are
/// opened and processed recursively. Other element types are skipped.
/// </summary>
/// <param name="reader">Reader to pull elements from until it returns null.</param>
static void ProcessElements(ElementReader reader)
{
    for (Element current = reader.Next(); current != null; current = reader.Next())
    {
        Element.Type kind = current.GetType();

        if (kind == Element.Type.e_path)
        {
            // Path data
            ProcessPath(reader, current);
        }
        else if (kind == Element.Type.e_text_begin)
        {
            // Text strings; ProcessText reads up to the end of the text block.
            ProcessText(reader);
        }
        else if (kind == Element.Type.e_form)
        {
            // Form XObject: step into it, process its content, then step back out.
            reader.FormBegin();
            ProcessElements(reader);
            reader.End();
        }
        else if (kind == Element.Type.e_image)
        {
            // Images
            ProcessImage(current);
        }
    }
}
386
/// <summary>
/// NUnit test that opens <c>TestFiles/newsletter.pdf</c> and writes the path, text and
/// image information of every page to the console.
/// </summary>
/// <remarks>
/// A <c>PDFNetException</c> is reported on the console and fails the test with the
/// exception's message.
/// </remarks>
[Test]
public static void Sample()
{
    try
    {
        Console.WriteLine("-------------------------------------------------");
        Console.WriteLine("Extract page element information from all ");
        Console.WriteLine("pages in the document.");

        // Open the test file
        using (PDFDoc doc = new PDFDoc(Utils.GetAssetTempFile(input_path + "newsletter.pdf")))
        {
            doc.InitSecurityHandler();

            using (ElementReader page_reader = new ElementReader())
            {
                // Read every page
                for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next())
                {
                    Console.WriteLine("Page {0:d}----------------------------------------",
                        itr.GetPageNumber());

                    Rect crop_box = itr.Current().GetCropBox();
                    crop_box.Normalize();

                    // Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2);
                    // Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height());

                    page_reader.Begin(itr.Current());
                    ProcessElements(page_reader);
                    page_reader.End();
                }
            }

            Console.WriteLine("Done.");
        }
    }
    catch (PDFNetException e)
    {
        Console.WriteLine(e.Message);
        // Fail with the actual reason instead of an anonymous "expected true" failure.
        Assert.Fail(e.Message);
    }
}
436 }
437}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales