PDF Data Extraction - Images, Text, Paths - Ruby Sample Code

Sample code for using Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Sample code is provided in Python, C++, C#, Go, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5// A sample project illustrating some extraction capabilities of ElementReader
6// in more detail
7//---------------------------------------------------------------------------------------
8
9using System;
10using pdftron;
11using pdftron.Common;
12using pdftron.Filters;
13using pdftron.SDF;
14using pdftron.PDF;
15
namespace ElementReaderAdvTestCS
{
	/// <summary>
	/// Sample that reads every page of a PDF with <c>ElementReader</c> and prints
	/// details about the path, text and image elements it finds. Form XObjects,
	/// non-shading fill patterns and Type3 glyphs are processed recursively.
	/// </summary>
	class Class1
	{
		private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
		static Class1() {}

		// Relative path to the folder containing test files.
		static string input_path =  "../../../../TestFiles/";
		static string output_path = "../../../../TestFiles/Output/";

		// Number of images saved so far; gives every extracted image a unique file name.
		static int image_counter = 0;

		/// <summary>
		/// Prints the segments of a path element, reports whether it is stroked and/or
		/// filled, and then walks the graphics-state changes recorded by the reader.
		/// </summary>
		/// <param name="reader">Reader that produced <paramref name="path"/>; its change list
		/// is iterated and cleared, and it is used to recurse into fill patterns.</param>
		/// <param name="path">The path element to describe.</param>
		public static void ProcessPath(ElementReader reader, Element path)
		{
			if (path.IsClippingPath())
			{
				Console.WriteLine("This is a clipping path");
			}

			PathData pathData = path.GetPathData();
			double[] data = pathData.points;
			byte[] opr = pathData.operators;

			// Index of the next unread coordinate in 'data'; each operator consumes
			// 0, 2, 4 or 6 values depending on its type.
			int data_itr = 0;

			// Use path.GetCTM() if you are interested in CTM (current transformation matrix).

			Console.Write(" Path Data Points := \"");
			foreach (byte op in opr)
			{
				switch ((PathData.PathSegmentType)((int)op))
				{
					case PathData.PathSegmentType.e_moveto:
					{
						double x1 = data[data_itr++];
						double y1 = data[data_itr++];
						Console.Write(string.Format("M{0:n0} {1:n0}", x1, y1));
						break;
					}
					case PathData.PathSegmentType.e_lineto:
					{
						double x1 = data[data_itr++];
						double y1 = data[data_itr++];
						Console.Write(string.Format(" L{0:n0} {1:n0}", x1, y1));
						break;
					}
					case PathData.PathSegmentType.e_cubicto:
					{
						double x1 = data[data_itr++];
						double y1 = data[data_itr++];
						double x2 = data[data_itr++];
						double y2 = data[data_itr++];
						double x3 = data[data_itr++];
						double y3 = data[data_itr++];
						Console.Write(string.Format(" C{0:n0} {1:n0} {2:n0} {3:n0} {4:n0} {5:n0}",
							x1, y1, x2, y2, x3, y3));
						break;
					}
					case PathData.PathSegmentType.e_rect:
					{
						// A rectangle is stored as origin + size; print it as the
						// equivalent closed four-corner polygon.
						double x1 = data[data_itr++];
						double y1 = data[data_itr++];
						double w = data[data_itr++];
						double h = data[data_itr++];
						double x2 = x1 + w;
						double y2 = y1;
						double x3 = x2;
						double y3 = y1 + h;
						double x4 = x1;
						double y4 = y3;
						Console.Write(string.Format("M{0:n0} {1:n0} L{2:n0} {3:n0} L{4:n0} {5:n0} L{6:n0} {7:n0} Z",
							x1, y1, x2, y2, x3, y3, x4, y4));
						break;
					}
					case PathData.PathSegmentType.e_closepath:
						Console.WriteLine(" Close Path");
						break;
					default:
						System.Diagnostics.Debug.Assert(false);
						break;
				}
			}

			Console.Write("\" ");

			GState gs = path.GetGState();

			// Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
			if (path.IsStroked())
			{
				Console.WriteLine("Stroke path");

				if (gs.GetStrokeColorSpace().GetType() == ColorSpace.Type.e_pattern)
				{
					Console.WriteLine("Path has associated pattern");
				}
				else
				{
					// Get stroke color (you can use PDFNet color conversion facilities)
					// ColorPt rgb = new ColorPt();
					// gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
				}
			}

			if (path.IsFilled())
			{
				Console.WriteLine("Fill path");

				if (gs.GetFillColorSpace().GetType() == ColorSpace.Type.e_pattern)
				{
					Console.WriteLine("Path has associated pattern");
				}
				else
				{
					// ColorPt rgb = new ColorPt();
					// gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
				}
			}

			// Process any changes in graphics state  ---------------------------------

			for (GSChangesIterator gs_itr = reader.GetChangesIterator(); gs_itr.HasNext(); gs_itr.Next())
			{
				switch (gs_itr.Current())
				{
					case GState.GStateAttribute.e_transform:
						// Get transform matrix for this element. Unlike path.GetCTM()
						// that returns the full transformation matrix, gs.GetTransform()
						// returns only the matrix that was installed for this element.
						//
						// gs.GetTransform();
						break;
					case GState.GStateAttribute.e_line_width:
						// gs.GetLineWidth();
						break;
					case GState.GStateAttribute.e_line_cap:
						// gs.GetLineCap();
						break;
					case GState.GStateAttribute.e_line_join:
						// gs.GetLineJoin();
						break;
					case GState.GStateAttribute.e_flatness:
						break;
					case GState.GStateAttribute.e_miter_limit:
						// gs.GetMiterLimit();
						break;
					case GState.GStateAttribute.e_dash_pattern:
						// double[] dashes;
						// gs.GetDashes(dashes);
						// gs.GetPhase()
						break;
					case GState.GStateAttribute.e_fill_color:
						if (gs.GetFillColorSpace().GetType() == ColorSpace.Type.e_pattern &&
							gs.GetFillPattern().GetType() != PatternColor.Type.e_shading)
						{
							// Process the pattern data: the pattern's own content is read
							// with the same reader and handled like page content.
							reader.PatternBegin(true);
							ProcessElements(reader);
							reader.End();
						}
						break;
				}
			}
			reader.ClearChangeList();
		}

		/// <summary>
		/// Consumes elements from <paramref name="page_reader"/> until the end of the current
		/// text block, printing the font name and character codes of every text run.
		/// Glyphs of Type3 fonts are processed recursively through <c>ProcessElements</c>.
		/// </summary>
		/// <param name="page_reader">Reader positioned just after a text-begin element.</param>
		public static void ProcessText(ElementReader page_reader)
		{
			// Begin text element
			Console.WriteLine("Begin Text Block:");

			Element element;
			while ((element = page_reader.Next()) != null)
			{
				switch (element.GetType())
				{
					case Element.Type.e_text_end:
						// Finish the text block
						Console.WriteLine("End Text Block.");
						return;

					case Element.Type.e_text:
					{
						GState gs = element.GetGState();

						// The values below are not printed; they are kept to show how the
						// fill color is converted to RGB and how the stroke color is read.
						ColorSpace cs_fill = gs.GetFillColorSpace();
						ColorPt fill = gs.GetFillColor();

						ColorPt outc = new ColorPt();
						cs_fill.Convert2RGB(fill, outc);

						ColorSpace cs_stroke = gs.GetStrokeColorSpace();
						ColorPt stroke = gs.GetStrokeColor();

						Font font = gs.GetFont();

						Console.Write("Font Name: ");
						Console.WriteLine(font.GetName());
						// font.IsFixedWidth();
						// font.IsSerif();
						// font.IsSymbolic();
						// font.IsItalic();
						// ...

						// double word_spacing = gs.GetWordSpacing();
						// double char_spacing = gs.GetCharSpacing();

						if (font.GetType() == Font.Type.e_Type3)
						{
							// Type 3 font: each glyph is itself a content stream, so read it
							// with the same reader.
							for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
							{
								page_reader.Type3FontBegin(itr.Current());
								ProcessElements(page_reader);
								page_reader.End();
							}
						}
						else
						{
							// To get the exact character positioning information you need to
							// concatenate current text matrix with CTM and then multiply
							// relative positioning coordinates with the resulting matrix.
							// The product is the same for every character of this element,
							// so it is computed once before the loop.
							//
							// The same matrix gives the effective font size:
							//   gs.GetFontSize() * Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d)
							Matrix2D ctm = element.GetCTM();
							Matrix2D text_mtx = element.GetTextMatrix();
							Matrix2D mtx = ctm * text_mtx;

							for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
							{
								Console.Write("Character code: ");
								int char_code = itr.Current().char_code;
								if (char_code >= 32 && char_code <= 127)
								{
									// Print if in ASCII range...
									Console.Write((char)char_code);
								}

								double x = itr.Current().x;		// character positioning information
								double y = itr.Current().y;
								mtx.Mult(ref x, ref y);
								// Console.WriteLine(" Position: x={0:f} y={1:f}", x, y);
							}
						}

						Console.WriteLine();
						break;
					}
				}
			}
		}

		/// <summary>
		/// Prints the pixel dimensions of an image element and saves it as a PNG file
		/// named <c>reader_img_extract_N.png</c> in the output folder.
		/// </summary>
		/// <param name="image">The image element to export.</param>
		public static void ProcessImage(Element image)
		{
			// image.IsImageMask() and image.IsImageInterpolate() are also available.
			int width = image.GetImageWidth();
			int height = image.GetImageHeight();

			Console.WriteLine("Image: width=\"{0:d}\" height=\"{1:d}\"", width, height);

			// Matrix2D mtx = image.GetCTM(); // image matrix (page positioning info)

			++image_counter;
			string file_name = output_path + "reader_img_extract_" + image_counter.ToString() + ".png";

			// Bitmap is IDisposable; release it as soon as the file is written instead of
			// waiting for finalization, since a document may contain many images.
			using (System.Drawing.Bitmap bmp = image.GetBitmap())
			{
				bmp.Save(file_name, System.Drawing.Imaging.ImageFormat.Png);
			}

			// Alternatively you can use GetImageData to read the raw (decoded) image data
			// image.GetBitsPerComponent();
			// image.GetImageData();	// get raw image data
			// another approach is to use Image2RGB filter that converts every image to
			// RGB format. This could save you time since you don't need to deal with color
			// conversions, image up-sampling, decoding etc.
			// ----------------
			//   Image2RGB img_conv = new Image2RGB(image);	// Extract and convert image to RGB 8-bpc format
			//   FilterReader reader = new FilterReader(img_conv);
			//   byte[] image_data_out = new byte[width * height * 3];  // A buffer used to keep image data.
			//   reader.Read(image_data_out);  // image_data_out contains RGB image data.
			// ----------------
			// Note that you don't need to read a whole image at a time. Alternatively
			// you can read a chunk at a time by repeatedly calling reader.Read(buf, buf_sz)
			// until the function returns 0.
		}

		/// <summary>
		/// Reads elements from <paramref name="reader"/> until it is exhausted and dispatches
		/// each one by type: paths, text blocks, form XObjects (recursively) and images.
		/// </summary>
		/// <param name="reader">Reader on which a Begin-style call has already been made.</param>
		static void ProcessElements(ElementReader reader)
		{
			Element element;

			while ((element = reader.Next()) != null)  // Read page contents
			{
				switch (element.GetType())
				{
					case Element.Type.e_path:          // Process path data...
						ProcessPath(reader, element);
						break;
					case Element.Type.e_text_begin:    // Process text strings...
						ProcessText(reader);
						break;
					case Element.Type.e_form:          // Process form XObjects
						reader.FormBegin();
						ProcessElements(reader);
						reader.End();
						break;
					case Element.Type.e_image:         // Process Images
						ProcessImage(element);
						break;
				}
			}
		}

		/// <summary>
		/// The main entry point for the application. Opens newsletter.pdf from the test
		/// folder and prints element information for every page.
		/// </summary>
		[STAThread]
		static void Main(string[] args)
		{
			try
			{
				PDFNet.Initialize(PDFTronLicense.Key);

				Console.WriteLine("-------------------------------------------------");
				Console.WriteLine("Extract page element information from all ");
				Console.WriteLine("pages in the document.");

				// Open the test file
				using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
				{
					doc.InitSecurityHandler();

					using (ElementReader page_reader = new ElementReader())
					{
						for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next())		//  Read every page
						{
							Console.WriteLine("Page {0:d}----------------------------------------",
								itr.GetPageNumber());

							Rect crop_box = itr.Current().GetCropBox();
							crop_box.Normalize();

							// Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2);
							// Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height());

							page_reader.Begin(itr.Current());
							ProcessElements(page_reader);
							page_reader.End();
						}
					}

					Console.WriteLine("Done.");
				}
			}
			catch (PDFNetException e)
			{
				Console.WriteLine(e.Message);
			}
			PDFNet.Terminate();
		}
	}
}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/PDFNet.h>
7#include <PDF/PDFDoc.h>
8#include <PDF/ElementReader.h>
9#include <PDF/Element.h>
10#include <PDF/Font.h>
11#include <Filters/FilterReader.h>
12#include <PDF/Image/Image2RGB.h>
13
14#include <iostream>
15#include <assert.h>
16#include "../../LicenseKey/CPP/LicenseKey.h"
17
18using namespace std;
19
20using namespace pdftron;
21using namespace PDF;
22using namespace SDF;
23using namespace Common;
24using namespace Filters; 
25
26char m_buf[4000];
27
28void ProcessElements(ElementReader& reader);
29
30void ProcessPath(ElementReader& reader, Element path)
31{
32	if (path.IsClippingPath())
33	{
34		cout << "This is a clipping path" << endl;
35	}
36
37	PathData d = path.GetPathData();
38
39	const UChar* opr = &d.GetOperators().front();
40	const UChar *opr_itr = opr, *opr_end = opr + d.GetOperators().size();
41	const double* data = &d.GetPoints().front();
42	const double *data_itr = data, *data_end = data + d.GetPoints().size();
43
44	double x1, y1, x2, y2, x3, y3;
45
46	// Use path.GetCTM() if you are interested in CTM (current transformation matrix).
47
48	cout << " Path Data Points := \"";
49	for (; opr_itr<opr_end; ++opr_itr)
50	{
51		switch(*opr_itr)
52		{
53		case PathData::e_moveto:
54			x1 = *data_itr; ++data_itr;
55			y1 = *data_itr; ++data_itr;
56			sprintf(m_buf, "M%.0f %.0f", x1, y1);
57			cout << m_buf;
58			break;
59		case PathData::e_lineto:
60			x1 = *data_itr; ++data_itr;
61			y1 = *data_itr; ++data_itr;
62			sprintf(m_buf, " L%.0f %.0f", x1, y1);
63			cout << m_buf;
64			break;
65		case PathData::e_cubicto:
66			x1 = *data_itr; ++data_itr;
67			y1 = *data_itr; ++data_itr;
68			x2 = *data_itr; ++data_itr;
69			y2 = *data_itr; ++data_itr;
70			x3 = *data_itr; ++data_itr;
71			y3 = *data_itr; ++data_itr;
72			sprintf(m_buf, " C%.0f %.0f %.0f %.0f %.0f %.0f", x1, y1, x2, y2, x3, y3);
73			cout << m_buf;
74			break;
75		case PathData::e_rect:
76			{
77				x1 = *data_itr; ++data_itr;
78				y1 = *data_itr; ++data_itr;
79				double w = *data_itr; ++data_itr;
80				double h = *data_itr; ++data_itr;
81				x2 = x1 + w;
82				y2 = y1;
83				x3 = x2;
84				y3 = y1 + h;
85				double x4 = x1; 
86				double y4 = y3;
87				sprintf(m_buf, "M%.0f %.0f L%.0f %.0f L%.0f %.0f L%.0f %.0f Z", 
88					x1, y1, x2, y2, x3, y3, x4, y4);
89				cout << m_buf;
90			}
91			break;
92		case PathData::e_closepath:
93			cout << " Close Path" << endl;
94			break;
95		default: 
96			assert(false);
97			break;
98		}	
99	}
100
101	cout << "\" ";
102
103	GState gs = path.GetGState();
104
105	// Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
106	if (path.IsStroked()) 
107	{
108		cout << "Stroke path" << endl; 
109
110		if (gs.GetStrokeColorSpace().GetType() == ColorSpace::e_pattern)
111		{
112			cout << "Path has associated pattern" << endl; 
113		}
114		else
115		{
116			// Get stroke color (you can use PDFNet color conversion facilities)
117			// ColorPt rgb;
118			// gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
119		}
120	}
121	else 
122	{
123		// Do not stroke path
124	}
125
126	if (path.IsFilled())
127	{
128		cout << "Fill path" << endl; 
129
130		if (gs.GetFillColorSpace().GetType() == ColorSpace::e_pattern)
131		{		
132			cout << "Path has associated pattern" << endl; 
133		}
134		else
135		{
136			// ColorPt rgb;
137			// gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
138		}        
139	}
140	else 
141	{
142		// Do not fill path
143	}
144
145	// Process any changes in graphics state  ---------------------------------
146
147	GSChangesIterator gs_itr = reader.GetChangesIterator();
148	for (; gs_itr.HasNext(); gs_itr.Next()) 
149	{
150		switch(gs_itr.Current())
151		{
152		case GState::e_transform :
153			// Get transform matrix for this element. Unlike path.GetCTM() 
154			// that return full transformation matrix gs.GetTransform() return 
155			// only the transformation matrix that was installed for this element.
156			//
157			// gs.GetTransform();
158			break;
159		case GState::e_line_width :
160			// gs.GetLineWidth();
161			break;
162		case GState::e_line_cap :
163			// gs.GetLineCap();
164			break;
165		case GState::e_line_join :
166			// gs.GetLineJoin();
167			break;
168		case GState::e_flatness :	
169			break;
170		case GState::e_miter_limit :
171			// gs.GetMiterLimit();
172			break;
173		case GState::e_dash_pattern :
174			{
175				// std::vector<double> dashes;
176				// gs.GetDashes(dashes);
177				// gs.GetPhase()
178			}
179			break;
180		case GState::e_fill_color:
181			{
182				if ( gs.GetFillColorSpace().GetType() == ColorSpace::e_pattern &&
183					gs.GetFillPattern().GetType() != PatternColor::e_shading )
184				{	
185					//process the pattern data
186					reader.PatternBegin(true);
187					ProcessElements(reader);
188					reader.End();
189				}
190			}
191			break;
192		}
193	}
194	reader.ClearChangeList();
195}
196
197void ProcessText(ElementReader& page_reader) 
198{
199	// Begin text element
200	cout << "Begin Text Block:" << endl;
201
202	Element element; 
203	while ((element = page_reader.Next()) != 0) 
204	{
205		switch (element.GetType())
206		{
207		case Element::e_text_end: 
208			// Finish the text block
209			cout << "End Text Block." << endl;
210			return;
211
212		case Element::e_text:
213			{
214				GState gs =  element.GetGState();
215
216				ColorSpace cs_fill = gs.GetFillColorSpace();
217				ColorPt fill = gs.GetFillColor();
218
219				ColorPt out;
220				cs_fill.Convert2RGB(fill, out);
221
222
223				ColorSpace cs_stroke = gs.GetStrokeColorSpace();
224				ColorPt stroke = gs.GetStrokeColor();
225
226				Font font = gs.GetFont();
227
228				cout << "Font Name: " << font.GetName() << endl;
229				// font.IsFixedWidth();
230				// font.IsSerif();
231				// font.IsSymbolic();
232				// font.IsItalic();
233				// ... 
234
235				// double font_size = gs.GetFontSize();
236				// double word_spacing = gs.GetWordSpacing();
237				// double char_spacing = gs.GetCharSpacing();
238				// const UString* txt = element.GetTextString();
239
240				if ( font.GetType() == Font::e_Type3 )
241				{
242					//type 3 font, process its data
243					for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next()) 
244					{
245						page_reader.Type3FontBegin(itr.Current());
246						ProcessElements(page_reader);
247						page_reader.End();
248					}
249				}
250
251				else
252				{	
253					Matrix2D text_mtx = element.GetTextMatrix();
254					double x, y;
255					unsigned int char_code;
256
257					for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next()) 
258					{
259						cout << "Character code: ";
260						char_code = itr.Current().char_code;
261						if (char_code>=32 || char_code<=127)
262						{ 
263							// Print if in ASCII range...
264							cout << char(char_code);
265						}
266
267						x = itr.Current().x;		// character positioning information
268						y = itr.Current().y;
269
270						// Use element.GetCTM() if you are interested in the CTM 
271						// (current transformation matrix).
272						Matrix2D ctm = element.GetCTM();
273
274						// To get the exact character positioning information you need to 
275						// concatenate current text matrix with CTM and then multiply 
276						// relative positioning coordinates with the resulting matrix.
277						Matrix2D mtx = ctm * text_mtx;
278						mtx.Mult(x, y);
279
280						// Get glyph path...
281						//vector<UChar> oprs;
282						//vector<double> glyph_data;
283						//font.GetGlyphPath(char_code, oprs, glyph_data, false, 0);
284					}
285				}
286
287				cout << endl;
288			}
289			break;
290		}
291	}
292}
293
294void ProcessImage(Element image)  
295{
296	bool image_mask = image.IsImageMask();
297	bool interpolate = image.IsImageInterpolate();
298	int width = image.GetImageWidth();
299	int height = image.GetImageHeight();
300	int out_data_sz = width * height * 3;
301
302	cout << "Image:" 
303		<< " width=\"" << width << "\""
304		<< " height=\"" << height << "\"" << endl;
305
306	// Matrix2D& mtx = image->GetCTM(); // image matrix (page positioning info)
307
308	// You can use GetImageData to read the raw (decoded) image data
309	//image->GetBitsPerComponent();	
310	//image->GetImageData();	// get raw image data
311	// .... or use Image2RGB filter that converts every image to RGB format,
312	// This should save you time since you don't need to deal with color conversions, 
313	// image up-sampling, decoding etc.
314
315	Image2RGB img_conv(image);	// Extract and convert image to RGB 8-bpc format
316	FilterReader reader(img_conv);
317
318	// A buffer used to keep image data.
319	std::vector<UChar> image_data_out; 
320	image_data_out.resize(out_data_sz);
321
322	reader.Read(&image_data_out.front(), out_data_sz);
323	// &image_data_out.front() contains RGB image data.
324
325	// Note that you don't need to read a whole image at a time. Alternatively
326	// you can read a chuck at a time by repeatedly calling reader.Read(buf, buf_sz) 
327	// until the function returns 0. 
328}
329
330void ProcessElements(ElementReader& reader) 
331{
332	Element element;
333	while ((element = reader.Next()) != 0) 	// Read page contents
334	{
335		switch (element.GetType())
336		{
337		case Element::e_path:						// Process path data...
338			{
339				ProcessPath(reader, element);
340			}
341			break; 
342		case Element::e_text_begin: 				// Process text block...
343			{
344				ProcessText(reader);
345			}
346			break;
347		case Element::e_form:						// Process form XObjects
348			{
349				reader.FormBegin(); 
350				ProcessElements(reader);
351				reader.End();
352			}
353			break; 
354		case Element::e_image:						// Process Images
355			{
356				ProcessImage(element);
357			}	
358			break; 
359		}
360	}
361}
362
363int main(int argc, char *argv[])
364{
365	int ret = 0;
366	PDFNet::Initialize(LicenseKey);
367
368	// Relative path to the folder containing test files.
369	string input_path =  "../../TestFiles/";
370	// string output_path = "../../TestFiles/Output/";
371
372
373	try	// Extract text data from all pages in the document
374	{
375		cout << "-------------------------------------------------" << endl;
376		cout << "Extract page element information from all " << endl;
377		cout << "pages in the document." << endl;
378
379		PDFDoc doc((input_path + "newsletter.pdf").c_str());
380		doc.InitSecurityHandler();
381
382		int pgnum = doc.GetPageCount();
383		PageIterator page_begin = doc.GetPageIterator();
384
385		ElementReader page_reader;
386
387		PageIterator itr;
388		for (itr = page_begin; itr.HasNext(); itr.Next())		//  Read every page
389		{				
390			cout << "Page " << itr.Current().GetIndex() << "----------------------------------------" << endl;
391			page_reader.Begin(itr.Current());
392			ProcessElements(page_reader);
393			page_reader.End();
394		}
395
396		cout << "Done." << endl;
397	}
398	catch(Exception& e)
399	{
400		cout << e << endl;
401		ret = 1;
402	}
403	catch(...)
404	{
405		cout << "Unknown Exception" << endl;
406		ret = 1;
407	}
408
409	PDFNet::Terminate();
410	return ret;
411}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8	"fmt"
9    "os"
10    "strconv"
11	. "pdftron"
12)
13
14import  "pdftron/Samples/LicenseKey/GO"
15
16func ProcessPath(reader ElementReader, path Element){
17    if path.IsClippingPath(){
18        fmt.Println("This is a clipping path")
19    }
20
21    pathData := path.GetPathData()
22    data := pathData.GetPoints()
23    opr := pathData.GetOperators()
24
25    oprIndex := 0
26    oprEnd := int(opr.Size())
27    dataIndex := 0
28    //dataEnd := data.Size()
29    
30    // Use path.GetCTM() if you are interested in CTM (current transformation matrix).
31    
32    os.Stdout.Write([]byte("Path Data Points := \""))
33    x1, x2, x3, x4 := 0.0, 0.0, 0.0, 0.0
34    y1, y2, y3, y4 := 0.0, 0.0, 0.0, 0.0
35    for oprIndex < oprEnd{
36        if int(opr.Get(oprIndex)) == int(PathDataE_moveto){
37            x1 = data.Get(dataIndex) 
38            dataIndex = dataIndex + 1
39            y1 = data.Get(dataIndex)
40            dataIndex = dataIndex + 1
41            os.Stdout.Write([]byte("M" + fmt.Sprintf("%f", x1) + " " + fmt.Sprintf("%f", y1)))
42        }else if int(opr.Get(oprIndex)) == int(PathDataE_lineto){
43            x1 = data.Get(dataIndex) 
44            dataIndex = dataIndex + 1
45            y1 = data.Get(dataIndex)
46            dataIndex = dataIndex + 1
47            os.Stdout.Write([]byte(" L" + fmt.Sprintf("%f", x1) + " " + fmt.Sprintf("%f", y1)))
48        }else if int(opr.Get(oprIndex)) == int(PathDataE_cubicto){
49            x1 = data.Get(dataIndex)
50            dataIndex = dataIndex + 1
51            y1 = data.Get(dataIndex)
52            dataIndex = dataIndex + 1
53            x2 = data.Get(dataIndex)
54            dataIndex = dataIndex + 1
55            y2 = data.Get(dataIndex)
56            dataIndex = dataIndex + 1
57            x3 = data.Get(dataIndex)
58            dataIndex = dataIndex + 1
59            y3 = data.Get(dataIndex)
60            dataIndex = dataIndex + 1
61            os.Stdout.Write([]byte(" C" + fmt.Sprintf("%f", x1) + " " + fmt.Sprintf("%f", y1) + " " + fmt.Sprintf("%f", x2) + " " + fmt.Sprintf("%f", y2) + " " + fmt.Sprintf("%f", x3) + " " + fmt.Sprintf("%f", y3)))
62        }else if int(opr.Get(oprIndex)) == int(PathDataE_rect){
63            x1 = data.Get(dataIndex)
64            dataIndex = dataIndex + 1
65            y1 = data.Get(dataIndex)
66            dataIndex = dataIndex + 1
67            w := data.Get(dataIndex)
68            dataIndex = dataIndex + 1
69            h := data.Get(dataIndex)
70            dataIndex = dataIndex + 1
71            x2 = x1 + w
72            y2 = y1
73            x3 = x2
74            y3 = y1 + h
75            x4 = x1
76            y4 = y3
77            os.Stdout.Write([]byte("M" + fmt.Sprintf("%.2f", x1) + " " + fmt.Sprintf("%.2f", y1) + " L" + fmt.Sprintf("%.2f", x2) + " " + fmt.Sprintf("%.2f", y2) + " L" + fmt.Sprintf("%.2f", x3) + " " + fmt.Sprintf("%.2f", y3) + " L" + fmt.Sprintf("%.2f", x4) + " " + fmt.Sprintf("%.2f", y4) + " Z"))
78        }else if int(opr.Get(oprIndex)) == int(PathDataE_closepath){
79            fmt.Println(" Close Path")
80        }else{
81            //
82        }
83        oprIndex = oprIndex + 1
84    }
85
86    os.Stdout.Write([]byte("\" "))
87    gs := path.GetGState()
88    
89    // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
90    if path.IsStroked(){
91        fmt.Println("Stroke path")
92        
93        if (gs.GetStrokeColorSpace().GetType() == ColorSpaceE_pattern){
94            fmt.Println("Path has associated pattern")
95        }else{
96            // Get stroke color (you can use PDFNet color conversion facilities)
97            // rgb = gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor())
98        }
99    }else{
100        // Do not stroke path
101    }
102
103    if path.IsFilled(){
104        fmt.Println("Fill path")
105        
106        if (gs.GetFillColorSpace().GetType() == ColorSpaceE_pattern){
107            fmt.Println("Path has associated pattern")
108        }else{
109            // rgb = gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor())
110        }
111    }else{
112        // Do not fill path
113    }
114
115    // Process any changes in graphics state  ---------------------------------
116    gsItr := reader.GetChangesIterator()
117    for gsItr.HasNext(){
118        if int(gsItr.Current()) == int(GStateE_transform){
119            // Get transform matrix for this element. Unlike path.GetCTM() 
120            // that return full transformation matrix gs.GetTransform() return 
121            // only the transformation matrix that was installed for this element.
122            //
123            // gs.GetTransform()
124            
125        }else if int(gsItr.Current()) == int(GStateE_line_width){
126            // gs.GetLineWidth()
127            
128        }else if int(gsItr.Current()) == int(GStateE_line_cap){
129            // gs.GetLineCap()
130            
131        }else if int(gsItr.Current()) == int(GStateE_line_join){
132            // gs.GetLineJoin()
133            
134        }else if int(gsItr.Current()) == int(GStateE_flatness){
135            
136        }else if int(gsItr.Current()) == int(GStateE_miter_limit){
137            // gs.GetMiterLimit()
138            
139        }else if int(gsItr.Current()) == int(GStateE_dash_pattern){
140            // dashes = gs.GetDashes()
141            // gs.GetPhase()
142            
143        }else if int(gsItr.Current()) == int(GStateE_fill_color){
144            if (int(gs.GetFillColorSpace().GetType()) == int(ColorSpaceE_pattern) && int(gs.GetFillPattern().GetType()) != int(PatternColorE_shading) ){
145                // process the pattern data
146                reader.PatternBegin(true)
147                ProcessElements(reader)
148                reader.End()
149            }
150        }
151        gsItr.Next()
152    }
153    reader.ClearChangeList()
154}
155
156func ProcessText (pageReader ElementReader){
157    // Begin text element
158    fmt.Println("Begin Text Block:")
159    
160    element := pageReader.Next()
161    
162    for element.GetMp_elem().Swigcptr() != 0{
163        etype := element.GetType()
164        if etype == ElementE_text_end{
165            // Finish the text block
166            fmt.Println("End Text Block.")
167            return
168        }else if etype == ElementE_text{
169            gs := element.GetGState()
170            
171            //csFill := gs.GetFillColorSpace()
172            //fill := gs.GetFillColor()
173            
174            //out := csFill.Convert2RGB(fill)
175            
176            //csStroke := gs.GetStrokeColorSpace()
177            //stroke := gs.GetStrokeColor()
178            
179            font := gs.GetFont()
180            fmt.Println("Font Name: " + font.GetName())
181            // font.IsFixedWidth()
182            // font.IsSerif()
183            // font.IsSymbolic()
184            // font.IsItalic()
185            // ... 
186
187            // fontSize = gs.GetFontSize()
188            // wordSpacing = gs.GetWordSpacing()
189            // charSpacing = gs.GetCharSpacing()
190            // txt := element.GetTextString()
191            if font.GetType() == FontE_Type3{
192                // type 3 font, process its data
193                itr := element.GetCharIterator()
194                for itr.HasNext(){
195                    pageReader.Type3FontBegin(itr.Current())
196                    ProcessElements(pageReader)
197                    pageReader.End()
198                }
199            }else{
200                text_mtx := element.GetTextMatrix()
201                
202                itr := element.GetCharIterator()
203                for itr.HasNext(){
204                    charCode := itr.Current().GetChar_data()
205                    if *charCode >= 32 && *charCode <= 255 {     // Print if in ASCII range...
206                        a := font.MapToUnicode(uint(*charCode))
207                        os.Stdout.Write([]byte( a )) // Revisit: if sys.version_info.major < 3 else ascii(a[0]) ))
208                    }    
209                    pt := NewPoint()   
210                    pt.SetX(itr.Current().GetX())     // character positioning information
211                    pt.SetY(itr.Current().GetY())
212                    
213                    // Use element.GetCTM() if you are interested in the CTM 
214                    // (current transformation matrix).
215                    ctm := element.GetCTM()
216                    
217                    // To get the exact character positioning information you need to 
218                    // concatenate current text matrix with CTM and then multiply 
219                    // relative positioning coordinates with the resulting matrix.
220                    mtx := ctm.Multiply(text_mtx)
221                    mtx.Mult(pt)
222                    itr.Next()
223                }
224            }
225            fmt.Println("")
226        }
227        element = pageReader.Next()
228    }
229}
230
231func ProcessImage (image Element){
232    //imageMask := image.IsImageMask()
233    //interpolate := image.IsImageInterpolate()
234    width := image.GetImageWidth()
235    height := image.GetImageHeight()
236    outDataSz := width * height * 3
237    
238    fmt.Println("Image: width=\"" + fmt.Sprintf("%d", width) + "\"" + " height=\"" + fmt.Sprintf("%d", height)+ "\"" )
239    
240    // Matrix2D& mtx = image->GetCTM() // image matrix (page positioning info)
241
242    // You can use GetImageData to read the raw (decoded) image data
243    //image->GetBitsPerComponent()    
244    //image->GetImageData()    // get raw image data
245    // .... or use Image2RGB filter that converts every image to RGB format,
246    // This should save you time since you don't need to deal with color conversions, 
247    // image up-sampling, decoding etc.
248    
249    imgConv := NewImage2RGB(image)     // Extract and convert image to RGB 8-bps format
250    reader := NewFilterReader(imgConv)
251
252    //imageDataOut := reader.Read(int64(outDataSz))
253    reader.Read(int64(outDataSz))
254    
255    // Note that you don't need to read a whole image at a time. Alternatively
256    // you can read a chuck at a time by repeatedly calling reader.Read(buf, buf_sz) 
257    // until the function returns 0. 
258}
259
260func ProcessElements(reader ElementReader){
261    element := reader.Next()     // Read page contents
262    for element.GetMp_elem().Swigcptr() != 0{
263        etype := element.GetType()
264        if etype == ElementE_path{      // Process path data...
265            ProcessPath(reader, element)
266        }else if etype == ElementE_text_begin{      // Process text block...
267            ProcessText(reader)
268        }else if etype == ElementE_form{    // Process form XObjects
269            reader.FormBegin()
270            ProcessElements(reader)
271            reader.End()
272        }else if etype == ElementE_image{    // Process Images
273            ProcessImage(element)
274        }
275        element = reader.Next()
276    }
277}
278
279func main(){
280    PDFNetInitialize(PDFTronLicense.Key)
281    
282    // Relative path to the folder containing the test files.
283    inputPath := "../../TestFiles/"
284    //outputPath := "../../TestFiles/Output/"
285    
286    // Extract text data from all pages in the document
287    
288    fmt.Println("__________________________________________________")
289    fmt.Println("Extract page element information from all ")
290    fmt.Println("pages in the document.")
291    
292    doc := NewPDFDoc(inputPath + "newsletter.pdf")
293    doc.InitSecurityHandler()
294    //pgnum := doc.GetPageCount()
295    pageBegin := doc.GetPageIterator()
296    pageReader := NewElementReader()
297    
298    itr := pageBegin
299    for itr.HasNext(){    // Read every page
300        fmt.Println("Page " + strconv.Itoa(itr.Current().GetIndex()) + "----------------------------------------")
301        pageReader.Begin(itr.Current())
302        ProcessElements(pageReader)
303        pageReader.End()
304        itr.Next()
305    }
306    doc.Close()
307    PDFNetTerminate()
308    fmt.Println("Done.")
309}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import com.pdftron.sdf.*;
7import com.pdftron.pdf.*;
8import com.pdftron.common.*;
9import com.pdftron.filters.FilterReader;
10
11
12public class ElementReaderAdvTest {
13
14    static String m_buf;
15
16    static void ProcessPath(ElementReader reader, Element path) throws PDFNetException {
17        if (path.isClippingPath()) {
18            System.out.println("This is a clipping path");
19        }
20
21        PathData pathData = path.getPathData();
22        double[] data = pathData.getPoints();
23        byte[] opr = pathData.getOperators();
24
25        double x1, y1, x2, y2, x3, y3;
26        // Use path.getCTM() if you are interested in CTM (current transformation matrix).
27
28        System.out.print(" Path Data Points := \"");
29        int data_index = 0;
30        for (int opr_index = 0; opr_index < opr.length; ++opr_index) {
31            switch (opr[opr_index]) {
32                case PathData.e_moveto:
33                    x1 = data[data_index];
34                    ++data_index;
35                    y1 = data[data_index];
36                    ++data_index;
37                    System.out.print("M" + x1 + " " + y1);
38                    break;
39                case PathData.e_lineto:
40                    x1 = data[data_index];
41                    ++data_index;
42                    y1 = data[data_index];
43                    ++data_index;
44                    System.out.print(" L" + x1 + " " + y1);
45
46                    break;
47                case PathData.e_cubicto:
48                    x1 = data[data_index];
49                    ++data_index;
50                    y1 = data[data_index];
51                    ++data_index;
52                    x2 = data[data_index];
53                    ++data_index;
54                    y2 = data[data_index];
55                    ++data_index;
56                    x3 = data[data_index];
57                    ++data_index;
58                    y3 = data[data_index];
59                    ++data_index;
60                    System.out.print(" C" + x1 + " " + y1 + " " + x2 + " " + y2 + " " + x3 + " " + y3);
61                    break;
62                case PathData.e_rect: {
63                    x1 = data[data_index];
64                    ++data_index;
65                    y1 = data[data_index];
66                    ++data_index;
67                    double w = data[data_index];
68                    ++data_index;
69                    double h = data[data_index];
70                    ++data_index;
71                    x2 = x1 + w;
72                    y2 = y1;
73                    x3 = x2;
74                    y3 = y1 + h;
75                    double x4 = x1;
76                    double y4 = y3;
77                    System.out.print("M" + x1 + " " + y1 + " L" + x2 + " " + y2 + " L" + x3 + " " + y3 + " L" + x4 + " " + y4 + " Z");
78                }
79                break;
80                case PathData.e_closepath:
81                    System.out.println(" Close Path");
82                    break;
83                default:
84                    throw new PDFNetException("Invalid Element Type", 0, "", "", "");
85            }
86        }
87
88        System.out.print("\" ");
89
90        GState gs = path.getGState();
91
92        // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
93        if (path.isStroked()) {
94            System.out.println("Stroke path");
95
96            if (gs.getStrokeColorSpace().getType() == ColorSpace.e_pattern) {
97                System.out.println("Path has associated pattern");
98            } else {
99                // Get stroke color (you can use PDFNet color conversion facilities)
100                ColorPt rgb = new ColorPt();
101                rgb = gs.getStrokeColor();
102                double v = rgb.get(0);
103                rgb = gs.getStrokeColorSpace().convert2RGB(rgb);
104                v = rgb.get(0);
105            }
106        } else {
107            // Do not stroke path
108        }
109
110        if (path.isFilled()) {
111            System.out.println("Fill path");
112
113            if (gs.getFillColorSpace().getType() == ColorSpace.e_pattern) {
114                System.out.println("Path has associated pattern");
115                PatternColor pat = gs.getFillPattern();
116                int type = pat.getType();
117                if (type == PatternColor.e_shading) {
118                    System.out.println("Shading");
119                    Shading shading = pat.getShading();
120                    if (shading.getType() == Shading.e_function_shading) {
121                        System.out.println("FUNCT");
122                    } else if (shading.getType() == Shading.e_axial_shading) {
123                        System.out.println("AXIAL");
124                    } else if (shading.getType() == Shading.e_radial_shading) {
125                        System.out.println("RADIAL");
126                    }
127                } else if (type == PatternColor.e_colored_tiling_pattern) {
128                    System.out.println("e_colored_tiling_pattern");
129                } else if (type == PatternColor.e_uncolored_tiling_pattern) {
130                    System.out.println("e_uncolored_tiling_pattern");
131                } else {
132                    System.out.println("?");
133                }
134            } else {
135                ColorPt rgb = new ColorPt();
136                rgb = gs.getFillColor();
137                double v = rgb.get(0);
138                rgb = gs.getFillColorSpace().convert2RGB(rgb);
139                v = rgb.get(0);
140            }
141        } else {
142            // Do not fill path
143        }
144
145        // Process any changes in graphics state  ---------------------------------
146
147        GSChangesIterator gs_itr = reader.getChangesIterator();
148        while (gs_itr.hasNext()) {
149            switch (gs_itr.next().intValue()) {
150                case GState.e_transform:
151                    // Get transform matrix for this element. Unlike path.GetCTM()
152                    // that return full transformation matrix gs.GetTransform() return
153                    // only the transformation matrix that was installed for this element.
154                    //
155                    //gs.getTransform();
156                    break;
157                case GState.e_line_width:
158                    //gs.getLineWidth();
159                    break;
160                case GState.e_line_cap:
161                    //gs.getLineCap();
162                    break;
163                case GState.e_line_join:
164                    //gs.getLineJoin();
165                    break;
166                case GState.e_flatness:
167                    break;
168                case GState.e_miter_limit:
169                    //gs.getMiterLimit();
170                    break;
171                case GState.e_dash_pattern: {
172                    //double[] dashes;
173                    //dashes=gs.getDashes();
174                    //gs.getPhase();
175                }
176                break;
177                case GState.e_fill_color: {
178                    if (gs.getFillColorSpace().getType() == ColorSpace.e_pattern &&
179                            gs.getFillPattern().getType() != PatternColor.e_shading) {
180                        //process the pattern data
181                        reader.patternBegin(true);
182                        ProcessElements(reader);
183                        reader.end();
184                    }
185                }
186                break;
187            }
188        }
189        reader.clearChangeList();
190    }
191
192    static void ProcessText(ElementReader page_reader) throws PDFNetException {
193        // Begin text element
194        System.out.println("Begin Text Block:");
195
196        Element element;
197        while ((element = page_reader.next()) != null) {
198            switch (element.getType()) {
199                case Element.e_text_end:
200                    // Finish the text block
201                    System.out.println("End Text Block.");
202                    return;
203
204                case Element.e_text: {
205                    GState gs = element.getGState();
206
207                    ColorSpace cs_fill = gs.getFillColorSpace();
208                    ColorPt fill = gs.getFillColor();
209
210                    ColorPt out;
211                    out = cs_fill.convert2RGB(fill);
212
213
214                    ColorSpace cs_stroke = gs.getStrokeColorSpace();
215                    ColorPt stroke = gs.getStrokeColor();
216
217                    Font font = gs.getFont();
218
219                    System.out.println("Font Name: " + font.getName());
220                    //font.isFixedWidth();
221                    //font.isSerif();
222                    //font.isSymbolic();
223                    //font.isItalic();
224                    // ...
225
226                    //double font_size = gs.getFontSize();
227                    //double word_spacing = gs.getWordSpacing();
228                    //double char_spacing = gs.getCharSpacing();
229                    //String txt = element.getTextString();
230
231                    if (font.getType() == Font.e_Type3) {
232                        //type 3 font, process its data
233                        for (CharIterator itr = element.getCharIterator(); itr.hasNext(); ) {
234                            page_reader.type3FontBegin(itr.next(), null);
235                            ProcessElements(page_reader);
236                            page_reader.end();
237                        }
238                    } else {
239                        Matrix2D text_mtx = element.getTextMatrix();
240                        double x, y;
241                        long char_code;
242
243                        for (CharIterator itr = element.getCharIterator(); itr.hasNext(); ) {
244                            CharData data = itr.next();
245                            char_code = data.getCharCode();
246                            //System.out.print("Character code: ");
247
248                            System.out.print(String.valueOf(char_code));
249
250                            x = data.getGlyphX();        // character positioning information
251                            y = data.getGlyphY();
252
253                            // Use element.getCTM() if you are interested in the CTM
254                            // (current transformation matrix).
255                            Matrix2D ctm = element.getCTM();
256
257                            // To get the exact character positioning information you need to
258                            // concatenate current text matrix with CTM and then multiply
259                            // relative positioning coordinates with the resulting matrix.
260                            //
261                            Matrix2D mtx = ctm.multiply(text_mtx);
262                            java.awt.geom.Point2D.Double t = mtx.multPoint(x, y);
263                            x = t.x;
264                            y = t.y;
265                            //System.out.println(" Position: x=" + x + " y=" + y );
266                        }
267
268                        System.out.println();
269                    }
270                }
271                break;
272            }
273        }
274    }
275
276    static void ProcessImage(Element image) throws PDFNetException {
277        boolean image_mask = image.isImageMask();
278        boolean interpolate = image.isImageInterpolate();
279        int width = image.getImageWidth();
280        int height = image.getImageHeight();
281        int out_data_sz = width * height * 3;
282
283        System.out.println("Image: " +
284                " width=\"" + width + "\""
285                + " height=\"" + height);
286
287        // Matrix2D& mtx = image->GetCTM(); // image matrix (page positioning info)
288
289        // You can use GetImageData to read the raw (decoded) image data
290        //image->GetBitsPerComponent();
291        //image->GetImageData();	// get raw image data
292        // .... or use Image2RGB filter that converts every image to RGB format,
293        // This should save you time since you don't need to deal with color conversions,
294        // image up-sampling, decoding etc.
295
296        Image2RGB img_conv = new Image2RGB(image);    // Extract and convert image to RGB 8-bpc format
297        FilterReader reader = new FilterReader(img_conv);
298
299        // A buffer used to keep image data.
300        byte[] buf = new byte[out_data_sz];
301        long image_data_out = reader.read(buf);
302        // &image_data_out.front() contains RGB image data.
303
304        // Note that you don't need to read a whole image at a time. Alternatively
305        // you can read a chunk at a time by repeatedly calling reader.Read(buf)
306        // until the function returns 0.
307    }
308
309    static void ProcessElements(ElementReader reader) throws PDFNetException {
310        Element element;
311        while ((element = reader.next()) != null)    // Read page contents
312        {
313            switch (element.getType()) {
314                case Element.e_path:                        // Process path data...
315                {
316                    ProcessPath(reader, element);
317                }
318                break;
319                case Element.e_text_begin:                // Process text block...
320                {
321                    ProcessText(reader);
322                }
323                break;
324                case Element.e_form:                        // Process form XObjects
325                {
326                    reader.formBegin();
327                    ProcessElements(reader);
328                    reader.end();
329                }
330                break;
331                case Element.e_image:                        // Process Images
332                {
333                    ProcessImage(element);
334                }
335                break;
336            }
337        }
338    }
339
340    public static void main(String[] args) {
341        PDFNet.initialize(PDFTronLicense.Key());
342
343        // Relative path to the folder containing test files.
344        String input_path = "../../TestFiles/";
345        // string output_path = "../../TestFiles/Output/";
346
347        System.out.println("__________________________________________________");
348        System.out.println("Extract page element information from all ");
349        System.out.println("pages in the document.");
350        try (PDFDoc doc = new PDFDoc((input_path + "newsletter.pdf")))    // Extract text data from all pages in the document
351        {
352            doc.initSecurityHandler();
353
354            int pgnum = doc.getPageCount();
355            PageIterator page_begin = doc.getPageIterator();
356
357            ElementReader page_reader = new ElementReader();
358
359            PageIterator itr;
360
361            for (itr = page_begin; itr.hasNext(); )        //  Read every page
362            {
363                Page nextPage = itr.next();
364                System.out.println("Page " + nextPage.getIndex() +
365                        "----------------------------------------");
366                page_reader.begin(nextPage);
367                ProcessElements(page_reader);
368                page_reader.end();
369            }
370            System.out.println("Done");
371        } catch (Exception e) {
372            System.out.println(e);
373        }
374
375        PDFNet.terminate();
376    }
377}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6
7const { PDFNet } = require('@pdftron/pdfnet-node');
8const PDFTronLicense = require('../LicenseKey/LicenseKey');
9
10((exports) => {
11
12  exports.runElementReaderAdvTest = () => {
13
14    const processPath = async (reader, path) => {
15      if (await path.isClippingPath()) {
16        console.log('This is a clipping path');
17      }
18
19      const d = await path.getPathData();
20
21      const opr = d.operators;
22      const opr_len = opr.byteLength;
23      const data = d.points;
24      let data_idx = 0, data_len = data.byteLength / data.BYTES_PER_ELEMENT;
25
26      let x1, y1, x2, y2, x3, y3;
27
28      // Use path.GetCTM() if you are interested in CTM (current transformation matrix).
29
30      let path_str = ' Path Data Points := "';
31      for (let opr_idx = 0; opr_idx < opr_len; ++opr_idx) {
32        switch (opr[opr_idx]) {
33          case PDFNet.Element.PathSegmentType.e_moveto:
34            x1 = data[data_idx]; ++data_idx;
35            y1 = data[data_idx]; ++data_idx;
36            path_str += 'M' + Math.round(x1) + ' ' + Math.round(y1);
37            break;
38          case PDFNet.Element.PathSegmentType.e_lineto:
39            x1 = data[data_idx]; ++data_idx;
40            y1 = data[data_idx]; ++data_idx;
41            path_str += 'L' + Math.round(x1) + ' ' + Math.round(y1);
42            break;
43          case PDFNet.Element.PathSegmentType.e_cubicto:
44            x1 = data[data_idx]; ++data_idx;
45            y1 = data[data_idx]; ++data_idx;
46            x2 = data[data_idx]; ++data_idx;
47            y2 = data[data_idx]; ++data_idx;
48            x3 = data[data_idx]; ++data_idx;
49            y3 = data[data_idx]; ++data_idx;
50            path_str += 'C' + Math.round(x1) + ' ' + Math.round(y1) + ' ' + Math.round(x2)
51             + ' ' + Math.round(y2) + ' ' + Math.round(x3) + ' ' + Math.round(y3);
52            break;
53          case PDFNet.Element.PathSegmentType.e_rect:
54            x1 = data[data_idx]; ++data_idx;
55            y1 = data[data_idx]; ++data_idx;
56            const w = data[data_idx]; ++data_idx;
57            const h = data[data_idx]; ++data_idx;
58            x2 = x1 + w;
59            y2 = y1;
60            x3 = x2;
61            y3 = y1 + h;
62            const x4 = x1;
63            const y4 = y3;
64            path_str += 'M' + Math.round(x1) + ' ' + Math.round(y1) + ' L' + Math.round(x2) + ' ' + Math.round(y2)
65             + ' L' + Math.round(x3) + ' ' + Math.round(y3) + ' L' + Math.round(x4) + ' ' + Math.round(y4) + ' Z';
66            break;
67          case PDFNet.Element.PathSegmentType.e_closepath:
68            path_str += ' Close Path\n';
69            break;
70          default:
71            throw ''
72            break;
73        }
74      }
75
76      path_str += '" ';
77
78      const gs = await path.getGState();
79
80      // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
81      if (await path.isStroked()) {
82        console.log(path_str + 'Stroke path');
83        path_str = '';
84
85        if (await (await gs.getStrokeColorSpace()).getType() === PDFNet.ColorSpace.Type.e_pattern) {
86          console.log('Path has associated pattern');
87        } else {
88          // Get stroke color (you can use PDFNet color conversion facilities)
89          // ColorPt rgb;
90          // gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
91        }
92      } else {
93        // Do not stroke path
94      }
95
96      if (await path.isFilled()) {
97        console.log(path_str + 'Fill path');
98        path_str = '';
99
100        if (await (await gs.getFillColorSpace()).getType() === PDFNet.ColorSpace.Type.e_pattern) {
101          console.log('Path has associated pattern');
102        } else {
103          // ColorPt rgb;
104          // gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
105        }
106      } else {
107        // Do not fill path
108      }
109
110      if (path_str) {
111        console.log(path_str);
112      }
113
114      // Process any changes in graphics state  ---------------------------------
115
116      const gs_itr = await reader.getChangesIterator();
117      for (; await gs_itr.hasNext(); await gs_itr.next()) {
118        switch (await gs_itr.current()) {
119          case PDFNet.GState.Attribute.e_transform:
120            // Get transform matrix for this element. Unlike path.GetCTM() 
121            // that return full transformation matrix gs.GetTransform() return 
122            // only the transformation matrix that was installed for this element.
123            //
124            // gs.GetTransform();
125            break;
126          case PDFNet.GState.Attribute.e_line_width:
127            // gs.GetLineWidth();
128            break;
129          case PDFNet.GState.Attribute.e_line_cap:
130            // gs.GetLineCap();
131            break;
132          case PDFNet.GState.Attribute.e_line_join:
133            // gs.GetLineJoin();
134            break;
135          case PDFNet.GState.Attribute.e_flatness:
136            break;
137          case PDFNet.GState.Attribute.e_miter_limit:
138            // gs.GetMiterLimit();
139            break;
140          case PDFNet.GState.Attribute.e_dash_pattern:
141            {
142              // std::vector<double> dashes;
143              // gs.GetDashes(dashes);
144              // gs.GetPhase()
145            }
146            break;
147          case PDFNet.GState.Attribute.e_fill_color:
148            {
149              if (await (await gs.getFillColorSpace()).getType() === PDFNet.ColorSpace.Type.e_pattern &&
150                await (await gs.getFillPattern()).getType() !== PDFNet.PatternColor.Type.e_shading) {
151                //process the pattern data
152                await reader.patternBegin(true);
153                await processElements(reader);
154                await reader.end();
155              }
156            }
157            break;
158        }
159      }
160      await reader.clearChangeList();
161    };
162
163    const processText = async (pageReader) => {
164      // Begin text element
165      console.log('Begin Text Block:');
166
167      let element;
168      while (element = await pageReader.next()) {
169        switch (await element.getType()) {
170          case PDFNet.Element.Type.e_text_end:
171            // Finish the text block
172            console.log('End Text Block.');
173            return;
174
175          case PDFNet.Element.Type.e_text:
176            const gs = await element.getGState();
177
178            const cs_fill = await gs.getFillColorSpace();
179            const fill = await gs.getFillColor();
180
181            const out = await cs_fill.convert2RGB(fill);
182
183
184            const cs_stroke = await gs.getStrokeColorSpace();
185            const stroke = await gs.getStrokeColor();
186
187            const font = await gs.getFont();
188
189            console.log('Font Name: ' + await font.getName());
190
191            let outPutStr = '';
192            if (await font.getType() == PDFNet.Font.Type.e_Type3) {
193              //type 3 font, process its data
194              for (const itr = await element.getCharIterator(); await itr.hasNext(); await itr.next()) {
195                await pageReader.type3FontBegin(await itr.current());
196                await processElements(pageReader);
197                await pageReader.end();
198              }
199            } else {
200              const text_mtx = await element.getTextMatrix();
201
202              for (const itr = await element.getCharIterator(); await itr.hasNext(); await itr.next()) {
203                outPutStr += 'Character code: ';
204                const charData = await itr.current();
205                const charCode = charData.char_code;
206                if (charCode >= 32 || charCode <= 127) {
207                  // Print if in ASCII range...
208                  outPutStr += String.fromCharCode(charCode);
209                }
210
211                const x = charData.x;		// character positioning information
212                const y = charData.y;
213
214                // Use element.GetCTM() if you are interested in the CTM 
215                // (current transformation matrix).
216                const ctm = await element.getCTM();
217
218                // To get the exact character positioning information you need to 
219                // concatenate current text matrix with CTM and then multiply 
220                // relative positioning coordinates with the resulting matrix.
221                await ctm.multiply(text_mtx);
222                await ctm.mult(x, y);
223              }
224            }
225            console.log(outPutStr);
226            break;
227        }
228      }
229    };
230
231    const processImage = async (image) => {
232      const width = await image.getImageWidth();
233      const height = await image.getImageHeight();
234      const out_data_sz = await width * height * 3;
235
236      console.log('Image: width=\'' + width + '\' height=\'' + height + '\'');
237
238      const img_conv = await PDFNet.Filter.createImage2RGBFromElement(image);	// Extract and convert image to RGB 8-bpc format
239      const reader = await PDFNet.FilterReader.create(img_conv);
240
241      const image_data_out = await reader.read(out_data_sz);
242
243      // Note that you don't need to read a whole image at a time. Alternatively
244      // you can read a chuck at a time by repeatedly calling reader.Read(buf, buf_sz) 
245      // until the function returns 0. 
246    }
247
248    const processElements = async (reader) => {
249      let element;
250      while (element = await reader.next()) {	// Read page contents
251        switch (await element.getType()) {
252          case PDFNet.Element.Type.e_path:						// Process path data...
253            await processPath(reader, element);
254            break;
255          case PDFNet.Element.Type.e_text_begin: 				// Process text block...
256            await processText(reader);
257            break;
258          case PDFNet.Element.Type.e_form:						// Process form XObjects
259            await reader.formBegin();
260            await processElements(reader);
261            await reader.end();
262            break;
263          case PDFNet.Element.Type.e_image:						// Process Images
264            await processImage(element);
265            break;
266        }
267      }
268    }
269
270    const main = async () => {
271      // Relative path to the folder containing test files.
272      const inputPath = '../TestFiles/';
273      try {
274        console.log('-------------------------------------------------');
275        console.log('Extract page element information from all ');
276        console.log('pages in the document.');
277
278        const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'newsletter.pdf');
279        doc.initSecurityHandler();
280
281        const pgnum = await doc.getPageCount();
282        const pageBegin = await doc.getPageIterator();
283
284        const pageReader = await PDFNet.ElementReader.create();
285
286        for (const itr = pageBegin; await itr.hasNext(); await itr.next())		//  Read every page
287        {
288          const curPage = await itr.current();
289          console.log('Page ' + await curPage.getIndex() + '----------------------------------------');
290          await pageReader.beginOnPage(curPage);
291          await processElements(pageReader);
292          await pageReader.end();
293        }
294
295        console.log('Done.');
296      } catch (err) {
297        console.log(err);
298      }
299    };
300    PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function (error) { console.log('Error: ' + JSON.stringify(error)); }).then(function () { return PDFNet.shutdown(); });
301  };
302  exports.runElementReaderAdvTest();
303})(exports);
304// eslint-disable-next-line spaced-comment
305//# sourceURL=ElementReaderAdvTest.js

1<?php
2#---------------------------------------------------------------------------------------
3# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
4# Consult LICENSE.txt regarding license information.
5#---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
/**
 * Prints one path element: its geometry in an SVG-like notation, whether it
 * is stroked and/or filled, and then walks the graphics-state changes the
 * reader recorded for it. A non-shading fill pattern is processed recursively
 * through ProcessElements().
 *
 * @param $reader the ElementReader that produced $path
 * @param $path   the path element to describe
 */
function ProcessPath($reader, $path)
{
	if ($path->IsClippingPath()) {
		echo nl2br("This is a clipping path\n");
	}

	$geometry = $path->GetPathData();
	$coords = $geometry->GetPoints();
	$ops = $geometry->GetOperators();
	$opCount = count((array)$ops);
	$pos = 0;	// index of the next unread coordinate in $coords

	// Hands out coordinates one at a time, in stream order.
	$nextCoord = function () use ($coords, &$pos) {
		$value = $coords[$pos];
		++$pos;
		return $value;
	};

	// Use path.GetCTM() if you are interested in CTM (current transformation matrix).

	echo " Path Data Points := \"";
	$i = 0;
	while ($i < $opCount) {
		$op = $ops[$i];

		if ($op == PathData::e_moveto) {
			$x = $nextCoord();
			$y = $nextCoord();
			printf("M%.5g %.5g", $x, $y);
		} elseif ($op == PathData::e_lineto) {
			$x = $nextCoord();
			$y = $nextCoord();
			printf(" L%.5g %.5g", $x, $y);
		} elseif ($op == PathData::e_cubicto) {
			// Two control points followed by the end point.
			$c1x = $nextCoord();
			$c1y = $nextCoord();
			$c2x = $nextCoord();
			$c2y = $nextCoord();
			$endX = $nextCoord();
			$endY = $nextCoord();
			printf(" C%.5g %.5g %.5g %.5g %.5g %.5g", $c1x, $c1y, $c2x, $c2y, $endX, $endY);
		} elseif ($op == PathData::e_rect) {
			// A rectangle is stored as origin + size; expand it to four corners.
			$left = $nextCoord();
			$bottom = $nextCoord();
			$width = $nextCoord();
			$height = $nextCoord();
			$right = $left + $width;
			$top = $bottom + $height;
			printf("M%.5g %.5g L%.5g %.5g L%.5g %.5g L%.5g %.5g Z",
				$left, $bottom, $right, $bottom, $right, $top, $left, $top);
		} elseif ($op == PathData::e_closepath) {
			echo nl2br(" Close Path\n");
		}
		// Any other operator is ignored.

		++$i;
	}

	echo "\" ";

	$gs = $path->GetGState();

	// Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
	if ($path->IsStroked()) {
		echo nl2br("Stroke path\n");

		$strokeSpaceType = $gs->GetStrokeColorSpace()->GetType();
		if ($strokeSpaceType == ColorSpace::e_pattern) {
			echo nl2br("Path has associated pattern\n");
		}
		// Otherwise the stroke color is available through PDFNet's color conversion:
		// $rgb = $gs->GetStrokeColorSpace()->Convert2RGB($gs->GetStrokeColor());
	}

	if ($path->IsFilled()) {
		echo nl2br("Fill path\n");

		$fillSpaceType = $gs->GetFillColorSpace()->GetType();
		if ($fillSpaceType == ColorSpace::e_pattern) {
			echo nl2br("Path has associated pattern\n");
		}
		// Otherwise:
		// $rgb = $gs->GetFillColorSpace()->Convert2RGB($gs->GetFillColor());
	}

	// Process any changes in graphics state  ---------------------------------
	$changes = $reader->GetChangesIterator();
	while ($changes->HasNext()) {
		switch ($changes->Current()) {
		// These attributes are recognized but need no action in this sample.
		// The matching getters are: $gs->GetTransform() (only the matrix that
		// was installed for this element, unlike path.GetCTM()),
		// $gs->GetLineWidth(), $gs->GetLineCap(), $gs->GetLineJoin(),
		// $gs->GetMiterLimit(), $gs->GetDashes() / $gs->GetPhase().
		case GState::e_transform:
		case GState::e_line_width:
		case GState::e_line_cap:
		case GState::e_line_join:
		case GState::e_flatness:
		case GState::e_miter_limit:
		case GState::e_dash_pattern:
			break;

		case GState::e_fill_color:
			if ($gs->GetFillColorSpace()->GetType() == ColorSpace::e_pattern &&
				$gs->GetFillPattern()->GetType() != PatternColor::e_shading) {
				//process the pattern data
				$reader->PatternBegin(true);
				ProcessElements($reader);
				$reader->End();
			}
			break;
		}
		$changes->Next();
	}
	$reader->ClearChangeList();
}
173
/**
 * Reads elements from $page_reader until the current text block ends,
 * printing the font name and the characters of every text run.
 *
 * Runs set in a Type3 font are handled by processing each glyph's content
 * stream through ProcessElements(). For other fonts the character codes are
 * printed and the per-character positioning data is computed.
 *
 * @param $page_reader the ElementReader positioned just after an e_text_begin element
 */
function ProcessText($page_reader) 
{
	// Begin text element
	echo nl2br("Begin Text Block:\n");

	while (($element = $page_reader->Next()) != NULL) 
	{
		switch ($element->GetType())
		{
		case Element::e_text_end: 
			// Finish the text block
			echo nl2br("End Text Block.\n");
			return;

		case Element::e_text:
			{
				$gs = $element->GetGState();

				$cs_fill = $gs->GetFillColorSpace();
				$fill = $gs->GetFillColor();

				$out = $cs_fill->Convert2RGB($fill);

				$cs_stroke = $gs->GetStrokeColorSpace();
				$stroke = $gs->GetStrokeColor();

				$font = $gs->GetFont();

				echo nl2br("Font Name: ".$font->GetName()."\n");
				// $font->IsFixedWidth();
				// $font->IsSerif();
				// $font->IsSymbolic();
				// $font->IsItalic();
				// ... 

				// $font_size = $gs->GetFontSize();
				// $word_spacing = $gs->GetWordSpacing();
				// $char_spacing = $gs->GetCharSpacing();
				// $txt = $element->GetTextString();

				if ( $font->GetType() == Font::e_Type3 )
				{
					//type 3 font, process its data
					for ($itr = $element->GetCharIterator(); $itr->HasNext(); $itr->Next()) 
					{
						$page_reader->Type3FontBegin($itr->Current());
						ProcessElements($page_reader);
						$page_reader->End();
					}
				}
				else
				{	
					$text_mtx = $element->GetTextMatrix();
					
					for ($itr = $element->GetCharIterator(); $itr->HasNext(); $itr->Next()) 
					{
						$char_data = $itr->Current();
						$char_code = $char_data->char_code;
						// Both bounds must hold. With "||" the test was always true and
						// control characters were echoed as well.
						if ($char_code>=32 && $char_code<=255) { // Print if a printable single-byte code...
							echo chr($char_code);
						}

						$x = $char_data->x;		// character positioning information
						$y = $char_data->y;
						$pt = new Point($x, $y);

						// Use element.GetCTM() if you are interested in the CTM 
						// (current transformation matrix).
						$ctm = $element->GetCTM();

						// To get the exact character positioning information you need to 
						// concatenate current text matrix with CTM and then multiply 
						// relative positioning coordinates with the resulting matrix.
						// NOTE(review): PHP object assignment copies the handle, not the
						// object, so $mtx and $text_mtx look like the same matrix and the
						// Concat() below would accumulate across characters - confirm.
						$mtx = $text_mtx;
						$mtx->Concat($ctm->m_a, $ctm->m_b, $ctm->m_c, $ctm->m_d, $ctm->m_h, $ctm->m_v);
						$mtx->Mult($pt);

						// Get glyph path...
						//$glyphPath = $font->GetGlyphPath($char_code, false, 0);
						//$oprs = $glyphPath->GetOperators();
						//$glyph_data = $glyphPath->GetDataPoints();
					}
				}

				echo nl2br("\n");
			}
			break;
		}
	}
}
264
/**
 * Prints an image element's dimensions and reads its pixels through the
 * Image2RGB filter (RGB, 8 bits per component).
 *
 * @param $image an element whose type is Element::e_image
 */
function ProcessImage($image)  
{
	$image_mask = $image->IsImageMask();
	$interpolate = $image->IsImageInterpolate();
	$width = $image->GetImageWidth();
	$height = $image->GetImageHeight();

	// Three bytes (R, G, B) per pixel after conversion.
	$out_data_sz = $width * $height * 3;

	// The height value is now closed with its quote, matching the width value.
	echo "Image: " 
		." width=\"".$width."\""
		." height=\"".$height."\"\n";

	// $mtx = $image->GetCTM(); // image matrix (page positioning info)

	// You can use GetImageData to read the raw (decoded) image data
	//$image->GetBitsPerComponent();	
	//$image->GetImageData();	// get raw image data
	// .... or use Image2RGB filter that converts every image to RGB format,
	// This should save you time since you don't need to deal with color conversions, 
	// image up-sampling, decoding etc.

	$img_conv = new Image2RGB($image);	// Extract and convert image to RGB 8-bpc format
	$reader = new FilterReader($img_conv);

	// A buffer used to keep image data.
	$image_data_out = $reader->Read($out_data_sz);
	// $image_data_out contains RGB image data.

	// Note that you don't need to read a whole image at a time. Alternatively
	// you can read a chunk at a time by repeatedly calling reader.Read(buf_sz) 
	// until the function returns 0. 
}
298    
/**
 * Drains the reader's current element stream and hands each element to the
 * handler for its type. Form XObjects are entered and processed recursively;
 * element types without a handler are ignored.
 *
 * @param $reader the ElementReader to pull elements from
 */
function ProcessElements($reader) 
{
	// Read page contents
	for ($element = $reader->Next(); $element != NULL; $element = $reader->Next())
	{
		$kind = $element->GetType();

		if ($kind == Element::e_path) {
			// Process path data...
			ProcessPath($reader, $element);
		} elseif ($kind == Element::e_text_begin) {
			// Process text block...
			ProcessText($reader);
		} elseif ($kind == Element::e_form) {
			// Process form XObjects
			$reader->FormBegin();
			ProcessElements($reader);
			$reader->End();
		} elseif ($kind == Element::e_image) {
			// Process Images
			ProcessImage($element);
		}
	}
}
330
331	# Relative path to the folder containing the test files.
332	$input_path = getcwd()."/../../TestFiles/";
333	$output_path = $input_path."Output/";
334
335	PDFNet::Initialize($LicenseKey);
336	PDFNet::GetSystemFontList();    // Wait for fonts to be loaded if they haven't already. This is done because PHP can run into errors when shutting down if font loading is still in progress.
337
338	# Extract text data from all pages in the document
339	echo nl2br("__________________________________________________\n");
340	echo nl2br("Extract page element information from all \n");
341	echo nl2br("pages in the document.\n");
342
343	$doc = new PDFDoc($input_path."newsletter.pdf");
344	$doc->InitSecurityHandler();
345
346	$pgnum = $doc->GetPageCount();
347	$page_begin = $doc->GetPageIterator();
348
349	$page_reader = new ElementReader();
350
351	for ($itr = $page_begin; $itr->HasNext(); $itr->Next())		//  Read every page
352	{				
353		echo nl2br("Page ".$itr->Current()->GetIndex()."----------------------------------------\n");
354		$page_reader->Begin($itr->Current());
355		ProcessElements($page_reader);
356		$page_reader->End();
357	}
358	$doc->Close();
359	PDFNet::Terminate();
360	echo nl2br("Done.\n");		
361?>

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
# Prints one path element: its geometry in an SVG-like notation, whether it
# is stroked and/or filled, and then walks the graphics-state changes the
# reader recorded for it. A non-shading fill pattern is processed recursively
# through ProcessElements.
#
# reader:: the ElementReader that produced +path+
# path::   the path element to describe
#
# Raises RuntimeError when an operator other than moveto, lineto, cubicto,
# rect or closepath is encountered.
def ProcessPath(reader, path)
	if path.IsClippingPath
		puts "This is a clipping path"
	end
	
	pathData = path.GetPathData
	data = pathData.GetPoints
	opr = pathData.GetOperators

	opr_index = 0
	opr_end = opr.size
	data_index = 0

	# Returns the next coordinate and advances the read position.
	next_coord = lambda do
		value = data[data_index]
		data_index += 1
		value
	end

	# Use path.GetCTM if you are interested in CTM (current transformation matrix).
	print "Path Data Points := \""
	
	while opr_index < opr_end
		case opr[opr_index].ord
		when PathData::E_moveto
			x1 = next_coord.call
			y1 = next_coord.call
			# print, not puts: the whole path is emitted as one quoted string,
			# so no newline may be inserted after the move-to.
			print "M#{x1} #{y1}"
		when PathData::E_lineto
			x1 = next_coord.call
			y1 = next_coord.call
			print " L#{x1} #{y1}"
		when PathData::E_cubicto
			# Two control points followed by the end point.
			x1 = next_coord.call
			y1 = next_coord.call
			x2 = next_coord.call
			y2 = next_coord.call
			x3 = next_coord.call
			y3 = next_coord.call
			print " C#{x1} #{y1} #{x2} #{y2} #{x3} #{y3}"
		when PathData::E_rect
			# A rectangle is stored as origin + size; expand it to four corners.
			x1 = next_coord.call
			y1 = next_coord.call
			w = next_coord.call
			h = next_coord.call
			x2 = x1 + w
			y2 = y1
			x3 = x2
			y3 = y1 + h
			x4 = x1
			y4 = y3
			print "M#{x1} #{y1} L #{x2} #{y2} L #{x3} #{y3} L #{x4} #{y4} Z"
		when PathData::E_closepath
			puts " Close Path"
		else
			raise "Assert: false"
		end
		opr_index += 1
	end
	
	print "\" "
	gs = path.GetGState
	
	# Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
	if path.IsStroked
		puts "Stroke path"
		
		if gs.GetStrokeColorSpace.GetType == ColorSpace::E_pattern
			puts "Path has associated pattern"
		else
			# Get stroke color (you can use PDFNet color conversion facilities)
			# rgb = gs.GetStrokeColorSpace.Convert2RGB(gs.GetStrokeColor)
		end
	else
		# Do not stroke path
	end
		
	if path.IsFilled
		puts "Fill path"
		
		if gs.GetFillColorSpace.GetType == ColorSpace::E_pattern
			puts "Path has associated pattern"
		else
			# rgb = gs.GetFillColorSpace.Convert2RGB(gs.GetFillColor)
		end
	else
		# Do not fill path
	end
	
	# Process any changes in graphics state  ---------------------------------
	gs_itr = reader.GetChangesIterator
	while gs_itr.HasNext do
		case gs_itr.Current
		when GState::E_transform
			# Get transform matrix for this element. Unlike path.GetCTM 
			# that return full transformation matrix gs.GetTransform return 
			# only the transformation matrix that was installed for this element.
			#
			# gs.GetTransform
		when GState::E_line_width
			# gs.GetLineWidth
		when GState::E_line_cap
			# gs.GetLineCap
		when GState::E_line_join
			# gs.GetLineJoin
		when GState::E_flatness
		when GState::E_miter_limit
			# gs.GetMiterLimit
		when GState::E_dash_pattern
			# dashes = gs.GetDashes
			# gs.GetPhase
		when GState::E_fill_color
			if (gs.GetFillColorSpace.GetType == ColorSpace::E_pattern and
				gs.GetFillPattern.GetType != PatternColor::E_shading )
				# process the pattern data
				reader.PatternBegin(true)
				ProcessElements(reader)
				reader.End
			end
		end
		gs_itr.Next
	end
	reader.ClearChangeList
end
147	
# Reads elements from +page_reader+ until the current text block ends,
# printing the font name and the characters of every text run.
#
# Runs set in a Type3 font are handled by processing each glyph's content
# stream through ProcessElements. For other fonts the character codes are
# mapped to Unicode and printed, and the positioning data is computed.
#
# page_reader:: the ElementReader positioned just after an E_text_begin element
def ProcessText (page_reader)
	# Begin text element
	puts "Begin Text Block:"
	
	element = page_reader.Next
	
	while !element.nil?
		type = element.GetType
		if type == Element::E_text_end
			# Finish the text block
			puts "End Text Block."
			return
		elsif type == Element::E_text
			gs = element.GetGState
			
			cs_fill = gs.GetFillColorSpace
			fill = gs.GetFillColor
			
			out = cs_fill.Convert2RGB(fill)
			
			cs_stroke = gs.GetStrokeColorSpace
			stroke = gs.GetStrokeColor
			
			font = gs.GetFont
			puts "Font Name: " + font.GetName
			# font.IsFixedWidth
			# font.IsSerif
			# font.IsSymbolic
			# font.IsItalic
			# ... 

			# font_size = gs.GetFontSize
			# word_spacing = gs.GetWordSpacing
			# char_spacing = gs.GetCharSpacing
			# txt = element.GetTextString
			if font.GetType == Font::E_Type3
				# type 3 font, process its data
				itr = element.GetCharIterator
				while itr.HasNext do
					page_reader.Type3FontBegin(itr.Current)
					ProcessElements(page_reader)
					page_reader.End
					# Advance to the next glyph; without this the loop never ends.
					itr.Next
				end
			else
				text_mtx = element.GetTextMatrix
				
				itr = element.GetCharIterator
				while itr.HasNext do
					char_code = itr.Current.char_code
					if char_code>=32 and char_code<=255	 # Print if a printable single-byte code...
						a = font.MapToUnicode(char_code)
						print a[0]
					end
						
					pt = Point.new   
					pt.x = itr.Current.x	 # character positioning information
					pt.y = itr.Current.y
					
					# Use element.GetCTM if you are interested in the CTM 
					# (current transformation matrix).
					ctm = element.GetCTM
					
					# To get the exact character positioning information you need to 
					# concatenate current text matrix with CTM and then multiply 
					# relative positioning coordinates with the resulting matrix.
					mtx = ctm.Multiply(text_mtx)
					mtx.Mult(pt)
					itr.Next
				end
			end
			puts ""
		end
		element = page_reader.Next
	end
end
223	
# Prints an image element's dimensions and reads its pixels through the
# Image2RGB filter (RGB, 8 bits per component).
#
# image:: an element whose type is Element::E_image
def ProcessImage (image)
	image_mask = image.IsImageMask
	interpolate = image.IsImageInterpolate
	width = image.GetImageWidth
	height = image.GetImageHeight
	# Three bytes (R, G, B) per pixel after conversion.
	out_data_sz = width * height * 3
	
	# The height value is now closed with its quote, matching the width value.
	puts "Image: width=\"" + width.to_s + "\"" + " height=\"" + height.to_s + "\""
	
	# mtx = image.GetCTM # image matrix (page positioning info)

	# You can use GetImageData to read the raw (decoded) image data
	#image.GetBitsPerComponent	
	#image.GetImageData	# get raw image data
	# .... or use Image2RGB filter that converts every image to RGB format,
	# This should save you time since you don't need to deal with color conversions, 
	# image up-sampling, decoding etc.
	
	img_conv = Image2RGB.new(image)	 # Extract and convert image to RGB 8-bpc format
	reader = FilterReader.new(img_conv)

	# image_data_out holds the converted RGB image data.
	image_data_out = reader.Read(out_data_sz)
	
	# Note that you don't need to read a whole image at a time. Alternatively
	# you can read a chunk at a time by repeatedly calling reader.Read(buf_sz) 
	# until the function returns 0. 
end
251
# Drains the reader's current element stream and hands each element to the
# handler for its type. Form XObjects are entered and processed recursively;
# element types without a handler are ignored.
#
# reader:: the ElementReader to pull elements from
def ProcessElements(reader)
	# Read page contents; Next returns nil once the stream is exhausted.
	until (element = reader.Next).nil?
		kind = element.GetType

		if Element::E_path == kind
			# Process path data...
			ProcessPath(reader, element)
		elsif Element::E_text_begin == kind
			# Process text block...
			ProcessText(reader)
		elsif Element::E_form == kind
			# Process form XObjects
			reader.FormBegin
			ProcessElements(reader)
			reader.End
		elsif Element::E_image == kind
			# Process Images
			ProcessImage(element)
		end
	end
end
271
272	PDFNet.Initialize(PDFTronLicense.Key)
273	
274	# Relative path to the folder containing the test files.
275	input_path = "../../TestFiles/"
276	output_path = "../../TestFiles/Output/"
277	
278	# Extract text data from all pages in the document
279	
280	puts "__________________________________________________"
281	puts "Extract page element information from all "
282	puts "pages in the document."
283	
284
285	doc = PDFDoc.new(input_path + "newsletter.pdf")
286	doc.InitSecurityHandler
287	pgnum = doc.GetPageCount
288	page_begin = doc.GetPageIterator
289	page_reader = ElementReader.new
290	
291	itr = page_begin
292	while itr.HasNext do	# Read every page
293		puts "Page " + itr.Current.GetIndex.to_s + "----------------------------------------"
294		page_reader.Begin(itr.Current)
295		ProcessElements(page_reader)
296		page_reader.End
297		itr.Next
298	end
299	doc.Close
300	PDFNet.Terminate
301	puts "Done."

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
def ProcessPath(reader, path):
    """Print one path element and walk its graphics-state changes.

    Writes the path geometry in an SVG-like notation, reports whether the
    path is stroked and/or filled, then iterates over the graphics-state
    changes recorded by ``reader``. A non-shading fill pattern is processed
    recursively through ProcessElements().

    reader -- the ElementReader that produced ``path``
    path   -- the path element to describe

    Raises AssertionError when an operator other than moveto, lineto,
    cubicto, rect or closepath is encountered.
    """
    if path.IsClippingPath():
        print("This is a clipping path")

    path_data = path.GetPathData()
    coords = path_data.GetPoints()
    operators = path_data.GetOperators()
    pos = 0  # index of the next unread coordinate in coords

    # Use path.GetCTM() if you are interested in CTM (current transformation matrix).

    sys.stdout.write("Path Data Points := \"")

    for op in operators:
        if op == PathData.e_moveto:
            x, y = coords[pos], coords[pos + 1]
            pos += 2
            sys.stdout.write("M" + " ".join([str(x), str(y)]))
        elif op == PathData.e_lineto:
            x, y = coords[pos], coords[pos + 1]
            pos += 2
            sys.stdout.write(" L" + " ".join([str(x), str(y)]))
        elif op == PathData.e_cubicto:
            # Two control points followed by the end point.
            values = [coords[pos + k] for k in range(6)]
            pos += 6
            sys.stdout.write(" C" + " ".join([str(v) for v in values]))
        elif op == PathData.e_rect:
            # A rectangle is stored as origin + size; expand it to four corners.
            left, bottom = coords[pos], coords[pos + 1]
            width, height = coords[pos + 2], coords[pos + 3]
            pos += 4
            right = left + width
            top = bottom + height
            corners = [(left, bottom), (right, bottom), (right, top), (left, top)]
            sys.stdout.write(
                "M" + " L".join([str(cx) + " " + str(cy) for cx, cy in corners]) + " Z")
        elif op == PathData.e_closepath:
            print(" Close Path")
        else:
            assert False

    sys.stdout.write("\" ")
    gs = path.GetGState()

    # Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    if path.IsStroked():
        print("Stroke path")
        stroke_space_type = gs.GetStrokeColorSpace().GetType()
        if stroke_space_type == ColorSpace.e_pattern:
            print("Path has associated pattern")
        # Otherwise the stroke color is available through PDFNet's color conversion:
        # rgb = gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor())

    if path.IsFilled():
        print("Fill path")
        fill_space_type = gs.GetFillColorSpace().GetType()
        if fill_space_type == ColorSpace.e_pattern:
            print("Path has associated pattern")
        # Otherwise:
        # rgb = gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor())

    # Process any changes in graphics state  ---------------------------------
    changes = reader.GetChangesIterator()
    while changes.HasNext():
        changed = changes.Current()
        # Only a fill-color change needs work here. The other attributes
        # (e_transform, e_line_width, e_line_cap, e_line_join, e_flatness,
        # e_miter_limit, e_dash_pattern) can be read with gs.GetTransform()
        # (only the matrix installed for this element, unlike path.GetCTM()),
        # gs.GetLineWidth(), gs.GetLineCap(), gs.GetLineJoin(),
        # gs.GetMiterLimit(), gs.GetDashes() and gs.GetPhase().
        if changed == GState.e_fill_color:
            if (gs.GetFillColorSpace().GetType() == ColorSpace.e_pattern and
                    gs.GetFillPattern().GetType() != PatternColor.e_shading):
                # process the pattern data
                reader.PatternBegin(True)
                ProcessElements(reader)
                reader.End()
        changes.Next()
    reader.ClearChangeList()
148    
def ProcessText (page_reader):
    """Print the font name and characters of every run in a text block.

    Reads elements from ``page_reader`` until the e_text_end element is
    reached. Runs set in a Type3 font are handled by processing each glyph's
    content stream through ProcessElements(). For other fonts the character
    codes are mapped to Unicode and written out, and the positioning data is
    computed.

    page_reader -- the ElementReader positioned just after an e_text_begin element
    """
    # Begin text element
    print("Begin Text Block:")
    
    element = page_reader.Next()
    
    while element is not None:
        elem_type = element.GetType()
        if elem_type == Element.e_text_end:
            # Finish the text block
            print("End Text Block.")
            return
        elif elem_type == Element.e_text:
            gs = element.GetGState()
            
            cs_fill = gs.GetFillColorSpace()
            fill = gs.GetFillColor()
            
            out = cs_fill.Convert2RGB(fill)
            
            cs_stroke = gs.GetStrokeColorSpace()
            stroke = gs.GetStrokeColor()
            
            font = gs.GetFont()
            print("Font Name: " + font.GetName())
            # font.IsFixedWidth()
            # font.IsSerif()
            # font.IsSymbolic()
            # font.IsItalic()
            # ... 

            # font_size = gs.GetFontSize()
            # word_spacing = gs.GetWordSpacing()
            # char_spacing = gs.GetCharSpacing()
            # txt = element.GetTextString()
            if font.GetType() == Font.e_Type3:
                # type 3 font, process its data
                itr = element.GetCharIterator()
                while itr.HasNext():
                    page_reader.Type3FontBegin(itr.Current())
                    ProcessElements(page_reader)
                    page_reader.End()
                    # Advance to the next glyph; without this the loop never ends.
                    itr.Next()
            else:
                text_mtx = element.GetTextMatrix()
                
                itr = element.GetCharIterator()
                while itr.HasNext():
                    char_code = itr.Current().char_code
                    if char_code>=32 and char_code<=255:     # Print if a printable single-byte code...
                        a = font.MapToUnicode(char_code)
                        sys.stdout.write( a[0] if sys.version_info.major < 3 else ascii(a[0]) )
                        
                    pt = Point()   
                    pt.x = itr.Current().x     # character positioning information
                    pt.y = itr.Current().y
                    
                    # Use element.GetCTM() if you are interested in the CTM 
                    # (current transformation matrix).
                    ctm = element.GetCTM()
                    
                    # To get the exact character positioning information you need to 
                    # concatenate current text matrix with CTM and then multiply 
                    # relative positioning coordinates with the resulting matrix.
                    mtx = ctm.Multiply(text_mtx)
                    mtx.Mult(pt)
                    itr.Next()
            print("")
        element = page_reader.Next()
217    
def ProcessImage (image):
    """Print an image element's dimensions and read its pixels as RGB.

    The pixels are read through the Image2RGB filter (RGB, 8 bits per
    component).

    image -- an element whose type is Element.e_image
    """
    image_mask = image.IsImageMask()
    interpolate = image.IsImageInterpolate()
    width = image.GetImageWidth()
    height = image.GetImageHeight()
    # Three bytes (R, G, B) per pixel after conversion.
    out_data_sz = width * height * 3
    
    # The height value is now closed with its quote, matching the width value.
    print("Image: width=\"" + str(width) + "\"" + " height=\"" + str(height) + "\"")
    
    # mtx = image.GetCTM() # image matrix (page positioning info)

    # You can use GetImageData to read the raw (decoded) image data
    #image.GetBitsPerComponent()
    #image.GetImageData()    # get raw image data
    # .... or use Image2RGB filter that converts every image to RGB format,
    # This should save you time since you don't need to deal with color conversions, 
    # image up-sampling, decoding etc.
    
    img_conv = Image2RGB(image)     # Extract and convert image to RGB 8-bpc format
    reader = FilterReader(img_conv)

    # image_data_out holds the converted RGB image data.
    image_data_out = reader.Read(out_data_sz)
    
    # Note that you don't need to read a whole image at a time. Alternatively
    # you can read a chunk at a time by repeatedly calling reader.Read(buf_sz) 
    # until the function returns 0. 
244
def ProcessElements(reader):
    """Read elements from `reader` until it returns None, dispatching on type.

    Path elements go to ProcessPath, a text-begin marker hands the reader to
    ProcessText, form XObjects are entered with FormBegin(), processed
    recursively and left with End(), and image elements go to ProcessImage.
    Elements of any other type are skipped.
    """
    element = reader.Next()     # Read page contents
    while element is not None:
        # Named elem_type so the builtin `type` is not shadowed.
        elem_type = element.GetType()
        if elem_type == Element.e_path:             # Process path data...
            ProcessPath(reader, element)
        elif elem_type == Element.e_text_begin:     # Process text block...
            ProcessText(reader)
        elif elem_type == Element.e_form:           # Process form XObjects
            reader.FormBegin()
            ProcessElements(reader)
            reader.End()
        elif elem_type == Element.e_image:          # Process Images
            ProcessImage(element)
        element = reader.Next()
260
if __name__ == '__main__':
    # Script entry point: initialize PDFNet, dump the elements of every page
    # of newsletter.pdf through ProcessElements, then shut PDFNet down.
    PDFNet.Initialize(LicenseKey)

    # Relative path to the folder containing the test files.
    input_path = "../../TestFiles/"

    # Extract text data from all pages in the document

    print("__________________________________________________")
    print("Extract page element information from all ")
    print("pages in the document.")

    doc = PDFDoc(input_path + "newsletter.pdf")
    doc.InitSecurityHandler()
    page_reader = ElementReader()

    itr = doc.GetPageIterator()
    while itr.HasNext():    # Read every page
        page = itr.Current()
        print("Page " + str(page.GetIndex()) + "----------------------------------------")
        page_reader.Begin(page)
        ProcessElements(page_reader)
        page_reader.End()
        itr.Next()
    doc.Close()
    PDFNet.Terminate()
    print("Done.")

1'
2' Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3'
4' A sample project illustrating some extraction capabilities of ElementReader
5' in more detail
6'
7
8Imports System
9
10Imports pdftron
11Imports pdftron.Common
12Imports pdftron.Filters
13Imports pdftron.SDF
14Imports pdftron.PDF
15
16Module ElementReaderAdvTestVB
17    Dim pdfNetLoader As PDFNetLoader
18    Sub New()
19        pdfNetLoader = pdftron.PDFNetLoader.Instance()
20    End Sub
21
22    Dim m_buf As String
23
24    Sub ProcessPath(ByRef reader As ElementReader, ByRef path As Element)
25        If path.IsClippingPath() Then
26            Console.WriteLine("This is a clipping path")
27        End If
28
29        Dim pathData As PathData = path.GetPathData()
30        Dim data As Double() = pathData.points
31        Dim data_sz As Integer = data.Length
32
33        Dim opr As Byte() = pathData.operators
34        Dim opr_sz As Integer = opr.Length
35
36        Dim opr_itr As Integer = 0
37        Dim opr_end As Integer = opr_sz
38        Dim data_itr As Integer = 0
39        Dim data_end As Integer = data_sz
40        Dim x1, y1, x2, y2, x3, y3 As Double
41
42        ' Use path.GetCTM() if you are interested in CTM (current transformation matrix).
43
44        Console.Write(" Path Data Points := \")
45        While opr_itr < opr_end
46            'switch((Element.PathSegmentType)((int)opr[opr_itr]))
47            If opr(opr_itr) = pathData.PathSegmentType.e_moveto Then
48                x1 = data(data_itr)
49                data_itr += 1
50                y1 = data(data_itr)
51                data_itr += 1
52                m_buf = String.Format("M{0:g5} {1:g5}", x1, y1)
53                Console.Write(m_buf)
54            ElseIf opr(opr_itr) = pathData.PathSegmentType.e_lineto Then
55                x1 = data(data_itr)
56                data_itr += 1
57                y1 = data(data_itr)
58                data_itr += 1
59                m_buf = String.Format(" L{0:g5} {1:g5}", x1, y1)
60                Console.Write(m_buf)
61            ElseIf opr(opr_itr) = pathData.PathSegmentType.e_cubicto Then
62                x1 = data(data_itr)
63                data_itr += 1
64                y1 = data(data_itr)
65                data_itr += 1
66                x2 = data(data_itr)
67                data_itr += 1
68                y2 = data(data_itr)
69                data_itr += 1
70                x3 = data(data_itr)
71                data_itr += 1
72                y3 = data(data_itr)
73                data_itr += 1
74                Dim coords() As Object = New Object() {x1, y1, x2, y2, x3, y3}
75                m_buf = String.Format(" C{0:g5} {1:g5} {2:g5} {3:g5} {4:g5} {5:g5}", _
76                 coords)
77                Console.Write(m_buf)
78            ElseIf opr(opr_itr) = pathData.PathSegmentType.e_rect Then
79                x1 = data(data_itr)
80                data_itr += 1
81                y1 = data(data_itr)
82                data_itr += 1
83                Dim w As Double = data(data_itr)
84                data_itr += 1
85                Dim h As Double = data(data_itr)
86                data_itr += 1
87                x2 = x1 + w
88                y2 = y1
89                x3 = x2
90                y3 = y1 + h
91                Dim x4 As Double = x1
92                Dim y4 As Double = y3
93                Dim coords() As Object = New Object() {x1, y1, x2, y2, x3, y3, x4, y4}
94                m_buf = String.Format("M{0:g5} {1:g5} L{2:g5} {3:g5} L{4:g5} {5:g5} L{6:g5} {7:g5} Z", _
95                 coords)
96                Console.Write(m_buf)
97            ElseIf opr(opr_itr) = pathData.PathSegmentType.e_closepath Then
98                Console.WriteLine(" Close Path")
99            Else
100                System.Diagnostics.Debug.Assert(False)
101            End If
102
103            opr_itr += 1
104        End While
105
106        Console.Write(""" ")
107
108        Dim gs As GState = path.GetGState()
109
110        ' Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
111        If path.IsStroked() Then
112            Console.WriteLine("Stroke path")
113            If gs.GetStrokeColorSpace().GetType() = ColorSpace.Type.e_pattern Then
114                Console.WriteLine("Path has associated pattern")
115            Else
116                ' Get stroke color (you can use PDFNet color conversion facilities)
117                ' Dim rgb As ColorPt
118                ' gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb)
119            End If
120        Else
121            ' Do not stroke path
122        End If
123
124        If path.IsFilled() Then
125            Console.WriteLine("Fill path")
126
127            If gs.GetFillColorSpace().GetType() = ColorSpace.Type.e_pattern Then
128                Console.WriteLine("Path has associated pattern")
129            Else
130                ' Dim rgb As ColorPt
131                ' gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb)
132            End If
133        Else
134            ' Do not fill path
135        End If
136
137        ' Process any changes in graphics state  ---------------------------------
138        Dim gs_itr As GSChangesIterator = reader.GetChangesIterator()
139        While gs_itr.HasNext()
140            If gs_itr.Current() = GState.GStateAttribute.e_transform Then
141                ' Get transform matrix for this element. Unlike path.GetCTM() 
142                ' that return full transformation matrix gs.GetTransform() return 
143                ' only the transformation matrix that was installed for this element.
144                '
145                ' gs.GetTransform()
146            ElseIf gs_itr.Current() = GState.GStateAttribute.e_line_width Then
147                ' gs.GetLineWidth()
148            ElseIf gs_itr.Current() = GState.GStateAttribute.e_line_cap Then
149                ' gs.GetLineCap()
150            ElseIf gs_itr.Current() = GState.GStateAttribute.e_line_join Then
151                ' gs.GetLineJoin()
152            ElseIf gs_itr.Current() = GState.GStateAttribute.e_flatness Then
153            ElseIf gs_itr.Current() = GState.GStateAttribute.e_miter_limit Then
154                ' gs.GetMiterLimit()
155            ElseIf gs_itr.Current() = GState.GStateAttribute.e_dash_pattern Then
156                ' Dim dashes As Double()
157                ' gs.GetDashes(dashes)
158                ' gs.GetPhase()
159            End If
160
161            gs_itr.Next()
162        End While
163    End Sub
164
165    Sub ProcessText(ByRef page_reader As ElementReader)
166        ' Begin text element
167        Console.WriteLine("Begin Text Block:")
168
169        Dim element As Element
170        element = page_reader.Next()
171        While Not IsNothing(element)
172            If element.GetType() = element.Type.e_text_end Then
173                ' Finish the text block
174                Console.WriteLine("End Text Block.")
175                Return
176            ElseIf element.GetType() = element.Type.e_text Then
177                Dim gs As GState = element.GetGState()
178
179                Dim cs_fill As ColorSpace = gs.GetFillColorSpace()
180                Dim fill As ColorPt = gs.GetFillColor()
181
182                Dim outc As ColorPt = New ColorPt
183                cs_fill.Convert2RGB(fill, outc)
184
185                Dim cs_stroke As ColorSpace = gs.GetStrokeColorSpace()
186                Dim stroke As ColorPt = gs.GetStrokeColor()
187
188                Dim font As Font = gs.GetFont()
189
190                Console.Write("Font Name: ")
191                Console.Write(font.GetName())
192                ' font.IsFixedWidth()
193                ' font.IsSerif()
194                ' font.IsSymbolic()
195                ' font.IsItalic()
196                ' ... 
197
198                ' Dim word_spacing As Double = gs.GetWordSpacing()
199                ' Dim char_spacing As Double = gs.GetCharSpacing()
200
201                ' Use element.GetCTM() if you are interested in the CTM 
202                ' (current transformation matrix).
203                Dim ctm As Matrix2D = element.GetCTM()
204
205                Dim text_mtx As Matrix2D = element.GetTextMatrix()
206
207                Dim mtx As Matrix2D = New Matrix2D
208                mtx.Set(ctm)
209                mtx.Concat(text_mtx.m_a, text_mtx.m_b, text_mtx.m_c, text_mtx.m_d, text_mtx.m_h, text_mtx.m_v)
210                Dim font_sz_scale_factor As Double = System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d)
211                Dim font_size As Double = gs.GetFontSize()
212                Console.Write(" Font Size: {0:f}", font_sz_scale_factor * font_size)
213
214                Dim font_color As ColorPt = gs.GetFillColor()
215                Dim cs As ColorSpace = gs.GetFillColorSpace()
216
217                Dim rgb As ColorPt = New ColorPt
218                cs.Convert2RGB(font_color, rgb)
219
220                Console.WriteLine(" Font Color(RGB): red={0:d} green={1:d} blue={2:d}", _
221                    CByte(rgb.Get(0) * 255), CByte(rgb.Get(1) * 255), CByte(rgb.Get(2) * 255))
222
223                Dim x, y As Double
224                Dim char_code As Integer
225
226                Dim itr As CharIterator = element.GetCharIterator()
227                While itr.HasNext()
228                    Console.Write("Character code: ")
229                    char_code = itr.Current().char_code
230                    Console.Write(Chr(char_code))
231
232                    x = itr.Current().x      ' character positioning information
233                    y = itr.Current().y
234
235                    ' To get the exact character positioning information you need to 
236                    ' concatenate current text matrix with CTM and then multiply 
237                    ' relative positioning coordinates with the resulting matrix.
238                    '
239                    mtx.Set(ctm)
240                    mtx.Concat(text_mtx.m_a, text_mtx.m_b, text_mtx.m_c, text_mtx.m_d, text_mtx.m_h, text_mtx.m_v)
241                    mtx.Mult(x, y)
242                    Console.WriteLine(" Position: x={0:f} y={1:f}", x, y)
243                    itr.Next()
244                End While
245
246                Console.WriteLine()
247            End If
248            element = page_reader.Next()
249        End While
250    End Sub
251
252    Sub ProcessImage(ByRef image As Element)
253        Dim image_mask As Boolean = image.IsImageMask()
254        Dim interpolate As Boolean = image.IsImageInterpolate()
255        Dim width As Integer = image.GetImageWidth()
256        Dim height As Integer = image.GetImageHeight()
257        Dim out_data_sz As Integer = width * height * 3
258
259        Console.WriteLine("Image: width=""{0:d}"" height=""{1:d}""", width, height)
260
261        ' Dim mtx As Matrix2D = image.GetCTM() ' image matrix (page positioning info)
262
263        ' You can use GetImageData to read the raw (decoded) image data
264        'image.GetBitsPerComponent()    
265        'image.GetImageData()    ' get raw image data
266        ' .... or use Image2RGB filter that converts every image to RGB format,
267        ' This should save you time since you don't need to deal with color conversions, 
268        ' image up-sampling, decoding etc.
269
270        Dim img_conv As Image2RGB = New Image2RGB(image)       ' Extract and convert image to RGB 8-bpc format
271        Dim reader As FilterReader = New FilterReader(img_conv)
272
273        ' A buffer used to keep image data.
274        Dim image_data_out As Byte() = Nothing       '= New Byte(out_data_sz)
275
276        reader.Read(image_data_out)
277        ' image_data_out contains RGB image data.
278
279        ' Note that you don't need to read a whole image at a time. Alternatively
280        ' you can read a chuck at a time by repeatedly calling reader.Read(buf, buf_sz) 
281        ' until the function returns 0. 
282    End Sub
283
284    Sub ProcessElements(ByRef reader As ElementReader)
285        Dim element As Element = reader.Next()
286
287        element = reader.Next()
288        While Not IsNothing(element)         ' Read page contents
289            If element.GetType() = element.Type.e_path Then
290                ' Process path data...
291                ProcessPath(reader, element)
292            ElseIf element.GetType() = element.Type.e_text_begin Then
293                ' Process text strings...
294                ProcessText(reader)
295            ElseIf element.GetType() = element.Type.e_form Then
296                ' Process form XObjects
297                reader.FormBegin()
298                ProcessElements(reader)
299                reader.End()
300            ElseIf element.GetType() = element.Type.e_image Then
301                ' Process Images
302                ProcessImage(element)
303            End If
304            element = reader.Next()
305        End While
306    End Sub
307
308    Sub Main()
309
310        PDFNet.Initialize(PDFTronLicense.Key)
311
312        ' Relative path to the folder containing test files.
313        Dim input_path As String = "../../../../TestFiles/"
314        ' Dim output_path As String = "../../../../TestFiles/Output/"
315
316        Console.WriteLine("-------------------------------------------------")
317        Console.WriteLine("Extract page element information from all")
318        Console.WriteLine("pages in the document.")
319
320        ' Open the test file
321        Console.WriteLine("Opening the input file...")
322        Using doc As PDFDoc = New PDFDoc(input_path + "newsletter.pdf")
323            doc.InitSecurityHandler()
324
325            Dim pgnum As Integer = doc.GetPageCount()
326
327            Dim itr As PageIterator
328            Using page_reader As ElementReader = New ElementReader
329                itr = doc.GetPageIterator()
330                While itr.HasNext()    '  Read every page
331                    Console.WriteLine("Page {0:d} ----------------------------------------", _
332                     itr.GetPageNumber())
333
334                    Dim crop_box As Rect = itr.Current().GetCropBox()
335                    Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2)
336                    Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height())
337
338                    page_reader.Begin(itr.Current())
339                    ProcessElements(page_reader)
340                    page_reader.End()
341                    itr.Next()
342                End While
343            End Using
344        End Using
345        PDFNet.Terminate()
346        Console.WriteLine("Done.")
347
348    End Sub
349
350End Module

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales

Product:

PDF Data Extraction - Images, Text, Paths - Ruby Sample Code