ElementReaderAdv

Sample C# code for using Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Learn more about our Xamarin SDK and PDF Data Extraction SDK Capabilities.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
// A sample project illustrating some extraction capabilities of ElementReader
// in more detail
//---------------------------------------------------------------------------------------
8
9using System;
10using pdftron;
11using pdftron.Common;
12using pdftron.Filters;
13using pdftron.SDF;
14using pdftron.PDF;
15
16using NUnit.Framework;
17
namespace MiscellaneousSamples
{
    /// <summary>
    /// Sample fixture that walks every page of a PDF with an <c>ElementReader</c> and prints
    /// information about the paths, text runs, form XObjects and images it encounters.
    /// </summary>
    [TestFixture]
    public class ElementReaderAdvTest
    {
        // Relative path to the folder containing test files.
        const string input_path = "TestFiles/";

        /// <summary>
        /// Describes a single path element: prints its segment data, reports whether it is a
        /// clipping path and whether it is stroked and/or filled, then walks the reader's list
        /// of graphics-state changes and finally clears that list.
        /// </summary>
        /// <param name="reader">Reader that produced <paramref name="path"/>.</param>
        /// <param name="path">The path element to describe.</param>
        public static void ProcessPath(ElementReader reader, Element path)
        {
            if (path.IsClippingPath())
            {
                Console.WriteLine("This is a clipping path");
            }

            // Use path.GetCTM() if you are interested in CTM (current transformation matrix).
            PrintPathData(path.GetPathData());

            GState gs = path.GetGState();

            // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
            if (path.IsStroked())
            {
                Console.WriteLine("Stroke path");

                if (gs.GetStrokeColorSpace().GetType() == ColorSpace.Type.e_pattern)
                {
                    Console.WriteLine("Path has associated pattern");
                }
                else
                {
                    // Get stroke color (you can use PDFNet color conversion facilities)
                    // ColorPt rgb = new ColorPt();
                    // gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
                }
            }

            if (path.IsFilled())
            {
                Console.WriteLine("Fill path");

                if (gs.GetFillColorSpace().GetType() == ColorSpace.Type.e_pattern)
                {
                    Console.WriteLine("Path has associated pattern");
                }
                else
                {
                    // ColorPt rgb = new ColorPt();
                    // gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
                }
            }

            // Process any changes in graphics state ---------------------------------
            ProcessGStateChanges(reader, gs);
            reader.ClearChangeList();
        }

        /// <summary>
        /// Writes the path segments to the console in an SVG-like notation
        /// (M = move-to, L = line-to, C = cubic-to, Z = close of a rectangle).
        /// </summary>
        /// <param name="pathData">Operators and coordinates of the path.</param>
        private static void PrintPathData(PathData pathData)
        {
            double[] data = pathData.points;
            byte[] operators = pathData.operators;

            // Index of the next unread coordinate in 'data'; each operator consumes
            // a fixed number of coordinates (2, 2, 6 or 4 — see the cases below).
            int next = 0;
            double x1, y1, x2, y2, x3, y3;

            Console.Write(" Path Data Points := \"");
            foreach (byte op in operators)
            {
                switch ((PathData.PathSegmentType)((int)op))
                {
                    case PathData.PathSegmentType.e_moveto:
                        x1 = data[next++];
                        y1 = data[next++];
                        Console.Write("M{0:n0} {1:n0}", x1, y1);
                        break;

                    case PathData.PathSegmentType.e_lineto:
                        x1 = data[next++];
                        y1 = data[next++];
                        Console.Write(" L{0:n0} {1:n0}", x1, y1);
                        break;

                    case PathData.PathSegmentType.e_cubicto:
                        x1 = data[next++];
                        y1 = data[next++];
                        x2 = data[next++];
                        y2 = data[next++];
                        x3 = data[next++];
                        y3 = data[next++];
                        Console.Write(" C{0:n0} {1:n0} {2:n0} {3:n0} {4:n0} {5:n0}",
                            x1, y1, x2, y2, x3, y3);
                        break;

                    case PathData.PathSegmentType.e_rect:
                    {
                        // A rectangle is stored as (x, y, width, height); expand it
                        // into its four corner points before printing.
                        x1 = data[next++];
                        y1 = data[next++];
                        double w = data[next++];
                        double h = data[next++];
                        x2 = x1 + w;
                        y2 = y1;
                        x3 = x2;
                        y3 = y1 + h;
                        double x4 = x1;
                        double y4 = y3;
                        Console.Write("M{0:n0} {1:n0} L{2:n0} {3:n0} L{4:n0} {5:n0} L{6:n0} {7:n0} Z",
                            x1, y1, x2, y2, x3, y3, x4, y4);
                        break;
                    }

                    case PathData.PathSegmentType.e_closepath:
                        Console.WriteLine(" Close Path");
                        break;

                    default:
                        // Unknown segment type: flag it in debug builds.
                        System.Diagnostics.Debug.Assert(false);
                        break;
                }
            }

            Console.Write("\" ");
        }

        /// <summary>
        /// Iterates over the graphics-state attributes reported as changed by
        /// <paramref name="reader"/>. Only a change of fill color to a non-shading pattern
        /// triggers work: the pattern's content is read recursively via
        /// <see cref="ProcessElements"/>. The other cases are placeholders showing which
        /// <c>GState</c> getter corresponds to each attribute.
        /// </summary>
        /// <param name="reader">Reader whose change list is iterated.</param>
        /// <param name="gs">Graphics state of the element being processed.</param>
        private static void ProcessGStateChanges(ElementReader reader, GState gs)
        {
            for (GSChangesIterator gs_itr = reader.GetChangesIterator(); gs_itr.HasNext(); gs_itr.Next())
            {
                switch (gs_itr.Current())
                {
                    case GState.GStateAttribute.e_transform:
                        // Get transform matrix for this element. Unlike path.GetCTM(),
                        // which returns the full transformation matrix, gs.GetTransform()
                        // returns only the matrix that was installed for this element.
                        //
                        // gs.GetTransform();
                        break;
                    case GState.GStateAttribute.e_line_width:
                        // gs.GetLineWidth();
                        break;
                    case GState.GStateAttribute.e_line_cap:
                        // gs.GetLineCap();
                        break;
                    case GState.GStateAttribute.e_line_join:
                        // gs.GetLineJoin();
                        break;
                    case GState.GStateAttribute.e_flatness:
                        break;
                    case GState.GStateAttribute.e_miter_limit:
                        // gs.GetMiterLimit();
                        break;
                    case GState.GStateAttribute.e_dash_pattern:
                        // double[] dashes;
                        // gs.GetDashes(dashes);
                        // gs.GetPhase()
                        break;
                    case GState.GStateAttribute.e_fill_color:
                        if (gs.GetFillColorSpace().GetType() == ColorSpace.Type.e_pattern &&
                            gs.GetFillPattern().GetType() != PatternColor.Type.e_shading)
                        {
                            // Process the pattern data.
                            reader.PatternBegin(true);
                            ProcessElements(reader);
                            reader.End();
                        }
                        break;
                }
            }
        }

        /// <summary>
        /// Processes one text block. Reads elements from <paramref name="page_reader"/> until an
        /// <c>e_text_end</c> element is found (or the reader runs out of elements), printing the
        /// font name and the character codes of every text run on the way.
        /// </summary>
        /// <param name="page_reader">Reader positioned just after an <c>e_text_begin</c> element.</param>
        public static void ProcessText(ElementReader page_reader)
        {
            // Begin text element
            Console.WriteLine("Begin Text Block:");

            Element element;
            while ((element = page_reader.Next()) != null)
            {
                switch (element.GetType())
                {
                    case Element.Type.e_text_end:
                        // Finish the text block
                        Console.WriteLine("End Text Block.");
                        return;

                    case Element.Type.e_text:
                    {
                        GState gs = element.GetGState();

                        // Demonstrates color conversion: the fill color is converted to RGB.
                        ColorSpace cs_fill = gs.GetFillColorSpace();
                        ColorPt fill = gs.GetFillColor();

                        ColorPt outc = new ColorPt();
                        cs_fill.Convert2RGB(fill, outc);

                        ColorSpace cs_stroke = gs.GetStrokeColorSpace();
                        ColorPt stroke = gs.GetStrokeColor();

                        Font font = gs.GetFont();

                        Console.Write("Font Name: ");
                        Console.WriteLine(font.GetName());
                        // font.IsFixedWidth();
                        // font.IsSerif();
                        // font.IsSymbolic();
                        // font.IsItalic();
                        // ...

                        // double word_spacing = gs.GetWordSpacing();
                        // double char_spacing = gs.GetCharSpacing();

                        // Use element.GetCTM() if you are interested in the CTM
                        // (current transformation matrix).
                        if (font.GetType() == Font.Type.e_Type3)
                        {
                            // Type 3 font: each glyph is itself a content stream, so read it.
                            for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                            {
                                page_reader.Type3FontBegin(itr.Current());
                                ProcessElements(page_reader);
                                page_reader.End();
                            }
                        }
                        else
                        {
                            Matrix2D ctm = element.GetCTM();
                            Matrix2D text_mtx = element.GetTextMatrix();

                            // To get the exact character positioning information you need to
                            // concatenate the current text matrix with the CTM and then multiply
                            // the relative positioning coordinates with the resulting matrix.
                            // The product is the same for every character of this element,
                            // so it is computed once, outside the loop.
                            Matrix2D mtx = ctm * text_mtx;

                            /*
                            double font_sz_scale_factor = System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d);
                            double font_size = gs.GetFontSize();
                            Console.Write(" Font Size: {0:f}", font_sz_scale_factor * font_size);

                            ColorPt font_color = gs.GetFillColor();
                            ColorSpace cs = gs.GetFillColorSpace();

                            ColorPt rgb = new ColorPt();
                            cs.Convert2RGB(font_color, rgb);

                            Console.WriteLine(" Font Color(RGB): red={0:d} green={1:d} blue={2:d}",
                                (byte)(rgb.Get(0)*255),
                                (byte)(rgb.Get(1)*255),
                                (byte)(rgb.Get(2)*255));
                            */

                            for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                            {
                                var ch = itr.Current();

                                Console.Write("Character code: ");
                                int char_code = ch.char_code;

                                // Both bounds must hold; with '||' the test was always true
                                // and every code, printable or not, was written out.
                                if (char_code >= 32 && char_code <= 127)
                                {
                                    // Print if in ASCII range...
                                    Console.Write((char)char_code);
                                }

                                double x = ch.x; // character positioning information
                                double y = ch.y;

                                mtx.Mult(ref x, ref y);
                                // Console.WriteLine(" Position: x={0:f} y={1:f}", x, y);
                            }
                        }

                        Console.WriteLine();
                        break;
                    }
                }
            }
        }

        // Only referenced by the commented-out bitmap export in ProcessImage.
        static int image_counter = 0;

        /// <summary>
        /// Prints the pixel dimensions of an image element and reads its content through an
        /// <c>Image2RGB</c> filter into a local buffer.
        /// </summary>
        /// <param name="image">The image element to process.</param>
        /// <exception cref="OverflowException">
        /// The RGB buffer size (width * height * 3) does not fit in an <c>int</c>.
        /// </exception>
        public static void ProcessImage(Element image)
        {
            bool image_mask = image.IsImageMask();
            bool interpolate = image.IsImageInterpolate();
            int width = image.GetImageWidth();
            int height = image.GetImageHeight();

            // Three bytes per pixel (RGB, 8 bits per component). 'checked' turns a silent
            // wrap-around for very large images into an explicit OverflowException.
            int out_data_sz = checked(width * height * 3);

            Console.WriteLine("Image: width=\"{0:d}\" height=\"{1:d}\"", width, height);

            // Matrix2D mtx = image.GetCTM(); // image matrix (page positioning info)

            // ++image_counter;
            // System.Drawing.Bitmap bmp = image.GetBitmap();
            // bmp.Save(Utils.CreateExternalFile("reader_img_extract_") + image_counter.ToString() + ".png", System.Drawing.Imaging.ImageFormat.Png);
            //
            // Alternatively you can use GetImageData to read the raw (decoded) image data
            // image.GetBitsPerComponent();
            // image.GetImageData(); // get raw image data
            // another approach is to use Image2RGB filter that converts every image to
            // RGB format. This could save you time since you don't need to deal with color
            // conversions, image up-sampling, decoding etc.
            // ----------------
            Image2RGB img_conv = new Image2RGB(image);        // Extract and convert image to RGB 8-bpc format
            FilterReader reader = new FilterReader(img_conv);
            byte[] image_data_out = new byte[out_data_sz];    // A buffer used to keep image data.
            reader.Read(image_data_out);                      // image_data_out contains RGB image data.
            // ----------------
            // Note that you don't need to read a whole image at a time. Alternatively
            // you can read a chunk at a time by repeatedly calling reader.Read(buf, buf_sz)
            // until the function returns 0.
        }

        /// <summary>
        /// Reads elements from <paramref name="reader"/> until it is exhausted and dispatches
        /// each one by type: paths, text blocks, form XObjects (processed recursively) and images.
        /// Other element types are ignored.
        /// </summary>
        /// <param name="reader">Reader on which a Begin-style call has already been made.</param>
        static void ProcessElements(ElementReader reader)
        {
            Element element;

            while ((element = reader.Next()) != null) // Read page contents
            {
                switch (element.GetType())
                {
                    case Element.Type.e_path: // Process path data...
                        ProcessPath(reader, element);
                        break;

                    case Element.Type.e_text_begin: // Process text strings...
                        ProcessText(reader);
                        break;

                    case Element.Type.e_form: // Process form XObjects
                        reader.FormBegin();
                        ProcessElements(reader);
                        reader.End();
                        break;

                    case Element.Type.e_image: // Process Images
                        ProcessImage(element);
                        break;
                }
            }
        }

        /// <summary>
        /// Entry point of the sample: opens "newsletter.pdf" from the test-file folder and
        /// prints the element information of every page. A <c>PDFNetException</c> is logged
        /// and turned into a test failure.
        /// </summary>
        [Test]
        public static void Sample()
        {
            try
            {
                Console.WriteLine("-------------------------------------------------");
                Console.WriteLine("Extract page element information from all ");
                Console.WriteLine("pages in the document.");

                // Open the test file
                using (PDFDoc doc = new PDFDoc(Utils.GetAssetTempFile(input_path + "newsletter.pdf")))
                {
                    doc.InitSecurityHandler();

                    int pgnum = doc.GetPageCount();

                    using (ElementReader page_reader = new ElementReader())
                    {
                        for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) // Read every page
                        {
                            Console.WriteLine("Page {0:d}----------------------------------------",
                                itr.GetPageNumber());

                            Rect crop_box = itr.Current().GetCropBox();
                            crop_box.Normalize();

                            // Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2);
                            // Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height());

                            page_reader.Begin(itr.Current());
                            ProcessElements(page_reader);
                            page_reader.End();
                        }
                    }

                    Console.WriteLine("Done.");
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
                // Fail with the SDK's message so the test report shows the cause.
                Assert.Fail(e.Message);
            }
        }
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales