ElementReaderAdv

Sample C# code for using Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Learn more about our Xamarin SDK and PDF Data Extraction SDK Capabilities.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5// A sample project illustrating some extraction capabilities of ElementReader
6// in more detail
7//---------------------------------------------------------------------------------------
8
9using System;
10using pdftron;
11using pdftron.Common;
12using pdftron.Filters;
13using pdftron.SDF;
14using pdftron.PDF;
15
16using NUnit.Framework;
17
18namespace MiscellaneousSamples
19{
20	/// <summary>
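21	/// NUnit sample fixture that walks every page of a PDF with ElementReader and prints information about its paths, text runs, and images.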
22	/// </summary>
23	[TestFixture]
24	public class ElementReaderAdvTest
25	{
26		
27		// Relative path to the folder containing test files.
28		const string input_path =  "TestFiles/";
29
30		static string m_buf;
31
32		static public void ProcessPath(ElementReader reader, Element path)
33		{
34			if (path.IsClippingPath())
35			{
36				Console.WriteLine("This is a clipping path");
37			}
38
39            PathData pathData = path.GetPathData();
40			double[] data = pathData.points;
41			int data_sz = data.Length;
42
43            byte[] opr = pathData.operators;
44			int opr_sz = opr.Length;
45
46			int opr_itr = 0, opr_end = opr_sz;
47			int data_itr = 0, data_end = data_sz;
48			double x1, y1, x2, y2, x3, y3;
49
50			// Use path.GetCTM() if you are interested in CTM (current transformation matrix).
51
52			Console.Write(" Path Data Points := \"");
53			for ( ; opr_itr < opr_end; ++opr_itr)
54			{
55				switch((PathData.PathSegmentType)((int)opr[opr_itr]))
56				{
57                    case PathData.PathSegmentType.e_moveto:
58						x1 = data[data_itr]; ++data_itr;
59						y1 = data[data_itr]; ++data_itr;
60						m_buf = string.Format("M{0:n0} {1:n0}", x1, y1);
61						Console.Write(m_buf);
62						break;
63                    case PathData.PathSegmentType.e_lineto:
64						x1 = data[data_itr]; ++data_itr;
65						y1 = data[data_itr]; ++data_itr;
66						m_buf = string.Format(" L{0:n0} {1:n0}", x1, y1);
67						Console.Write(m_buf);
68						break;
69                    case PathData.PathSegmentType.e_cubicto:
70						x1 = data[data_itr]; ++data_itr;
71						y1 = data[data_itr]; ++data_itr;
72						x2 = data[data_itr]; ++data_itr;
73						y2 = data[data_itr]; ++data_itr;
74						x3 = data[data_itr]; ++data_itr;
75						y3 = data[data_itr]; ++data_itr;
76						m_buf = string.Format(" C{0:n0} {1:n0} {2:n0} {3:n0} {4:n0} {5:n0}",
77							new object[] {x1, y1, x2, y2, x3, y3});
78						Console.Write(m_buf);
79						break;
80                    case PathData.PathSegmentType.e_rect:
81					{
82						x1 = data[data_itr]; ++data_itr;
83						y1 = data[data_itr]; ++data_itr;
84						double w = data[data_itr]; ++data_itr;
85						double h = data[data_itr]; ++data_itr;
86						x2 = x1 + w;
87						y2 = y1;
88						x3 = x2;
89						y3 = y1 + h;
90						double x4 = x1; 
91						double y4 = y3;
92						m_buf = string.Format("M{0:n0} {1:n0} L{2:n0} {3:n0} L{4:n0} {5:n0} L{6:n0} {7:n0} Z",
93							new object[] {x1, y1, x2, y2, x3, y3, x4, y4});
94						Console.Write(m_buf);
95						break;
96					}
97                    case PathData.PathSegmentType.e_closepath:
98						Console.WriteLine(" Close Path");
99						break;
100					default: 
101						System.Diagnostics.Debug.Assert(false);
102						break;
103				}	
104			}
105
106			Console.Write("\" ");
107
108			GState gs = path.GetGState();
109
110			// Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
111			if (path.IsStroked()) 
112			{
113				Console.WriteLine("Stroke path"); 
114
115				if (gs.GetStrokeColorSpace().GetType() == ColorSpace.Type.e_pattern)
116				{
117					Console.WriteLine("Path has associated pattern"); 
118				}
119				else
120				{
121					// Get stroke color (you can use PDFNet color conversion facilities)
122					// ColorPt rgb = new ColorPt();
123					// gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
124				}
125			}
126			else 
127			{
128				// Do not stroke path
129			}
130
131			if (path.IsFilled())
132			{
133				Console.WriteLine("Fill path"); 
134
135				if (gs.GetFillColorSpace().GetType() == ColorSpace.Type.e_pattern)
136				{		
137					Console.WriteLine("Path has associated pattern"); 
138				}
139				else
140				{
141					// ColorPt rgb = new ColorPt();
142					// gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
143				}        
144			}
145			else 
146			{
147				// Do not fill path
148			}
149
150			// Process any changes in graphics state  ---------------------------------
151
152			GSChangesIterator gs_itr = reader.GetChangesIterator();
153			for ( ; gs_itr.HasNext(); gs_itr.Next()) 
154			{
155				switch(gs_itr.Current())
156				{
157					case GState.GStateAttribute.e_transform :
158						// Get transform matrix for this element. Unlike path.GetCTM() 
159						// that return full transformation matrix gs.GetTransform() return 
160						// only the transformation matrix that was installed for this element.
161						//
162						// gs.GetTransform();
163						break;
164					case GState.GStateAttribute.e_line_width :
165						// gs.GetLineWidth();
166						break;
167					case GState.GStateAttribute.e_line_cap :
168						// gs.GetLineCap();
169						break;
170					case GState.GStateAttribute.e_line_join :
171						// gs.GetLineJoin();
172						break;
173					case GState.GStateAttribute.e_flatness :	
174						break;
175					case GState.GStateAttribute.e_miter_limit :
176						// gs.GetMiterLimit();
177						break;
178					case GState.GStateAttribute.e_dash_pattern :
179					{
180						// double[] dashes;
181						// gs.GetDashes(dashes);
182						// gs.GetPhase()
183						break;
184					}
185					case GState.GStateAttribute.e_fill_color:
186					{
187						if ( gs.GetFillColorSpace().GetType() == ColorSpace.Type.e_pattern &&
188							 gs.GetFillPattern().GetType() != PatternColor.Type.e_shading)
189						{	
190							//process the pattern data
191							reader.PatternBegin(true);
192							ProcessElements(reader);
193							reader.End();
194						}
195						break;
196					}
197				}
198			}
199			reader.ClearChangeList();
200		}
201
202		static public void ProcessText(ElementReader page_reader) 
203		{
204			// Begin text element
205			Console.WriteLine("Begin Text Block:");
206
207			Element element; 
208			while ((element = page_reader.Next()) != null) 
209			{
210				switch (element.GetType())
211				{
212					case Element.Type.e_text_end: 
213						// Finish the text block
214						Console.WriteLine("End Text Block.");
215						return;
216
217					case Element.Type.e_text:
218					{
219						GState gs = element.GetGState();
220
221						ColorSpace cs_fill = gs.GetFillColorSpace();
222						ColorPt fill = gs.GetFillColor();
223
224						ColorPt outc = new ColorPt();
225						cs_fill.Convert2RGB(fill, outc);
226
227
228						ColorSpace cs_stroke = gs.GetStrokeColorSpace();
229						ColorPt stroke = gs.GetStrokeColor();
230
231						Font font = gs.GetFont();
232
233						Console.Write("Font Name: ");
234						Console.WriteLine(font.GetName());
235						// font.IsFixedWidth();
236						// font.IsSerif();
237						// font.IsSymbolic();
238						// font.IsItalic();
239						// ... 
240
241						// double word_spacing = gs.GetWordSpacing();
242						// double char_spacing = gs.GetCharSpacing();
243
244						// Use element.GetCTM() if you are interested in the CTM 
245						// (current transformation matrix).
246						if (font.GetType() == Font.Type.e_Type3)
247						{
248							//type 3 font, process its data
249							for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next()) 
250							{
251								page_reader.Type3FontBegin(itr.Current());
252								ProcessElements(page_reader);
253								page_reader.End();
254							}
255						}
256
257						else
258						{
259
260							Matrix2D ctm = element.GetCTM();
261
262							Matrix2D text_mtx = element.GetTextMatrix();
263
264                            /*
265                            Matrix2D mtx = ctm * text_mtx;
266                            double font_sz_scale_factor = System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d);
267                            double font_size = gs.GetFontSize();
268                            Console.Write(" Font Size: {0:f}", font_sz_scale_factor * font_size);
269
270                            ColorPt font_color = gs.GetFillColor();
271                            ColorSpace cs = gs.GetFillColorSpace();
272
273                            ColorPt rgb = new ColorPt();
274                            cs.Convert2RGB(font_color, rgb);
275                            Color font_color_rgb = Color.FromArgb(255, (byte)(rgb.get_c(0)*255),
276                            (byte)(rgb.get_c(1)*255), (byte)(rgb.get_c(2)*255));
277                                
278
279                            Console.WriteLine(" Font Color(RGB): red={0:d} green={1:d} blue={2:d}", 
280							(byte)(rgb.Get(0)*255),
281							(byte)(rgb.Get(1)*255),
282							(byte)(rgb.Get(2)*255));
283                            */
284
285                            double x, y;
286							int char_code; 
287											
288							for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next()) 
289							{
290								Console.Write("Character code: ");
291								char_code = itr.Current().char_code;
292                                if (char_code >= 32 || char_code <= 127)
293                                { 
294                                    // Print if in ASCII range...
295                                    Console.Write((char)char_code);
296                                }
297
298								x = itr.Current().x;		// character positioning information
299								y = itr.Current().y;
300
301								// To get the exact character positioning information you need to 
302								// concatenate current text matrix with CTM and then multiply 
303								// relative positioning coordinates with the resulting matrix.
304								//
305								Matrix2D mtx2 = ctm * text_mtx;
306								mtx2.Mult(ref x, ref y);
307								// Console.WriteLine(" Position: x={0:f} y={1:f}", x, y);
308							}
309						}
310
311						Console.WriteLine();
312						break;
313					}
314				}
315			}
316		}
317
318		static int image_counter = 0;
319
320		static public void ProcessImage(Element image)  
321		{
322			bool image_mask = image.IsImageMask();
323			bool interpolate = image.IsImageInterpolate();
324			int width = image.GetImageWidth();
325			int height = image.GetImageHeight();
326			int out_data_sz = width * height * 3;
327
328			Console.WriteLine("Image: width=\"{0:d}\" height=\"{1:d}\"", width, height);
329
330			// Matrix2D mtx = image.GetCTM(); // image matrix (page positioning info)
331
332// 			++image_counter;
333// 			System.Drawing.Bitmap bmp = image.GetBitmap();
334// 			bmp.Save(Utils.CreateExternalFile("reader_img_extract_") + image_counter.ToString() + ".png", System.Drawing.Imaging.ImageFormat.Png);
335// 
336			// Alternatively you can use GetImageData to read the raw (decoded) image data
337			// image.GetBitsPerComponent();	
338			// image.GetImageData();	// get raw image data
339			// another approach is to use Image2RGB filter that converts every image to 
340			// RGB format. This could save you time since you don't need to deal with color 
341			// conversions, image up-sampling, decoding etc.
342			// ----------------
343			   Image2RGB img_conv = new Image2RGB(image);	// Extract and convert image to RGB 8-bpc format
344			   FilterReader reader = new FilterReader(img_conv);			//   
345			   byte[] image_data_out = new byte[out_data_sz];  // A buffer used to keep image data.
346			   reader.Read(image_data_out);  // image_data_out contains RGB image data.
347			// ----------------
348			// Note that you don't need to read a whole image at a time. Alternatively
349			// you can read a chuck at a time by repeatedly calling reader.Read(buf, buf_sz) 
350			// until the function returns 0. 
351		}
352
353	static void ProcessElements(ElementReader reader) 
354	{
355		Element element;
356
357		while ((element = reader.Next()) != null)  // Read page contents
358		{
359			switch (element.GetType())
360			{
361				case Element.Type.e_path:          // Process path data...
362				{
363					ProcessPath(reader, element);
364					break; 
365				}
366				case Element.Type.e_text_begin:    // Process text strings...
367				{
368					ProcessText(reader);
369					break;
370				}
371				case Element.Type.e_form:          // Process form XObjects
372				{
373					reader.FormBegin(); 
374					ProcessElements(reader);
375					reader.End(); 
376					break; 
377				}
378				case Element.Type.e_image:         // Process Images
379				{
380					ProcessImage(element);
381					break; 
382				}	
383			}
384		}
385	}
386
387		/// <summary>
388		/// The main entry point for the application.
389		/// </summary>
390		[Test]
391		public static void Sample()
392		{
393			try
394			{
395
396				Console.WriteLine("-------------------------------------------------");
397				Console.WriteLine("Extract page element information from all ");
398				Console.WriteLine("pages in the document.");
399
400				// Open the test file
401				using (PDFDoc doc = new PDFDoc(Utils.GetAssetTempFile(input_path + "newsletter.pdf")))
402				{
403					doc.InitSecurityHandler();
404
405					int pgnum = doc.GetPageCount();
406					PageIterator itr;
407
408					using (ElementReader page_reader = new ElementReader())
409					{
410						for (itr = doc.GetPageIterator(); itr.HasNext(); itr.Next())		//  Read every page
411						{				
412							Console.WriteLine("Page {0:d}----------------------------------------",
413								itr.GetPageNumber());
414
415							Rect crop_box = itr.Current().GetCropBox();
416							crop_box.Normalize();
417
418							// Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2);
419							// Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height());
420
421							page_reader.Begin(itr.Current());
422							ProcessElements(page_reader);
423							page_reader.End(); 
424						}
425					}
426
427					Console.WriteLine("Done.");
428				}
429			}
430			catch (PDFNetException e)
431			{
432				Console.WriteLine(e.Message);
433				Assert.True(false);
434			}
435		}
436	}
437}
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales
Product:

ElementReaderAdv