Sample C# code for using Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Learn more about our Server SDK and PDF Data Extraction SDK Capabilities.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5// A sample project illustrating some extraction capabilities of ElementReader
6// in more detail
7//---------------------------------------------------------------------------------------
8
9using System;
10using pdftron;
11using pdftron.Common;
12using pdftron.Filters;
13using pdftron.SDF;
14using pdftron.PDF;
15
16namespace ElementReaderAdvTestCS
17{
18 /// <summary>
19 /// Summary description for Class1.
20 /// </summary>
21 class Class1
22 {
23 private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
24 static Class1() {}
25
26 // Relative path to the folder containing test files.
27 static string input_path = "../../../../TestFiles/";
28 static string output_path = "../../../../TestFiles/Output/";
29
30 static string m_buf;
31
/// <summary>
/// Writes a description of a path element to the console: its segment data,
/// whether it is stroked and/or filled, and whether a pattern is attached.
/// When a fill-color change refers to a non-shading pattern, the pattern's
/// own content is processed recursively.
/// </summary>
/// <param name="reader">Reader that produced <paramref name="path"/>; queried for
/// graphics-state changes and used to step into pattern content.</param>
/// <param name="path">The path element to describe.</param>
static public void ProcessPath(ElementReader reader, Element path)
{
    if (path.IsClippingPath())
    {
        Console.WriteLine("This is a clipping path");
    }

    PathData pathData = path.GetPathData();
    double[] points = pathData.points;
    byte[] operators = pathData.operators;

    // Position of the next unread coordinate in 'points'. Every operator
    // consumes a fixed number of coordinates (moveto/lineto: 2, cubicto: 6,
    // rect: 4, closepath: 0).
    int next = 0;

    // Use path.GetCTM() if you are interested in CTM (current transformation matrix).

    Console.Write(" Path Data Points := \"");
    foreach (byte op in operators)
    {
        switch ((PathData.PathSegmentType)((int)op))
        {
            case PathData.PathSegmentType.e_moveto:
            {
                double x = points[next++];
                double y = points[next++];
                m_buf = string.Format("M{0:n0} {1:n0}", x, y);
                Console.Write(m_buf);
                break;
            }
            case PathData.PathSegmentType.e_lineto:
            {
                double x = points[next++];
                double y = points[next++];
                m_buf = string.Format(" L{0:n0} {1:n0}", x, y);
                Console.Write(m_buf);
                break;
            }
            case PathData.PathSegmentType.e_cubicto:
            {
                double c1x = points[next++];
                double c1y = points[next++];
                double c2x = points[next++];
                double c2y = points[next++];
                double endX = points[next++];
                double endY = points[next++];
                m_buf = string.Format(" C{0:n0} {1:n0} {2:n0} {3:n0} {4:n0} {5:n0}",
                    c1x, c1y, c2x, c2y, endX, endY);
                Console.Write(m_buf);
                break;
            }
            case PathData.PathSegmentType.e_rect:
            {
                // A rectangle is stored as origin + size; print it as the
                // equivalent closed four-corner polygon.
                double left = points[next++];
                double bottom = points[next++];
                double width = points[next++];
                double height = points[next++];
                double right = left + width;
                double top = bottom + height;
                m_buf = string.Format("M{0:n0} {1:n0} L{2:n0} {3:n0} L{4:n0} {5:n0} L{6:n0} {7:n0} Z",
                    left, bottom, right, bottom, right, top, left, top);
                Console.Write(m_buf);
                break;
            }
            case PathData.PathSegmentType.e_closepath:
                Console.WriteLine(" Close Path");
                break;
            default:
                // Unknown segment type.
                System.Diagnostics.Debug.Assert(false);
                break;
        }
    }

    Console.Write("\" ");

    GState gs = path.GetGState();

    // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    if (path.IsStroked())
    {
        Console.WriteLine("Stroke path");

        if (gs.GetStrokeColorSpace().GetType() == ColorSpace.Type.e_pattern)
        {
            Console.WriteLine("Path has associated pattern");
        }
        // Otherwise the stroke color can be converted with the PDFNet color
        // conversion facilities:
        //   ColorPt rgb = new ColorPt();
        //   gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
    }

    if (path.IsFilled())
    {
        Console.WriteLine("Fill path");

        if (gs.GetFillColorSpace().GetType() == ColorSpace.Type.e_pattern)
        {
            Console.WriteLine("Path has associated pattern");
        }
        // Otherwise:
        //   ColorPt rgb = new ColorPt();
        //   gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
    }

    // Process any changes in graphics state ---------------------------------
    GSChangesIterator changes = reader.GetChangesIterator();
    while (changes.HasNext())
    {
        // Only a fill-color change triggers work in this sample. The other
        // attributes that may be reported here can be read from 'gs' if needed:
        //   e_transform    -> gs.GetTransform()  (only the matrix installed for
        //                     this element, unlike path.GetCTM())
        //   e_line_width   -> gs.GetLineWidth()
        //   e_line_cap     -> gs.GetLineCap()
        //   e_line_join    -> gs.GetLineJoin()
        //   e_flatness
        //   e_miter_limit  -> gs.GetMiterLimit()
        //   e_dash_pattern -> gs.GetDashes(dashes), gs.GetPhase()
        if (changes.Current() == GState.GStateAttribute.e_fill_color)
        {
            bool fillIsPattern = gs.GetFillColorSpace().GetType() == ColorSpace.Type.e_pattern;
            if (fillIsPattern && gs.GetFillPattern().GetType() != PatternColor.Type.e_shading)
            {
                // Process the pattern data.
                reader.PatternBegin(true);
                ProcessElements(reader);
                reader.End();
            }
        }

        changes.Next();
    }
    reader.ClearChangeList();
}
201
/// <summary>
/// Consumes elements from <paramref name="page_reader"/> until the end of the
/// current text block, printing the font name and the ASCII character codes of
/// every text run. Glyph content of Type 3 fonts is processed recursively.
/// </summary>
/// <param name="page_reader">Reader positioned just after an e_text_begin element.</param>
static public void ProcessText(ElementReader page_reader)
{
    // Begin text element
    Console.WriteLine("Begin Text Block:");

    Element element;
    while ((element = page_reader.Next()) != null)
    {
        switch (element.GetType())
        {
            case Element.Type.e_text_end:
                // Finish the text block
                Console.WriteLine("End Text Block.");
                return;

            case Element.Type.e_text:
            {
                GState gs = element.GetGState();

                // Color conversion example: convert the fill color to RGB.
                ColorSpace cs_fill = gs.GetFillColorSpace();
                ColorPt fill = gs.GetFillColor();
                ColorPt outc = new ColorPt();
                cs_fill.Convert2RGB(fill, outc);

                // The stroke color is available the same way:
                // ColorSpace cs_stroke = gs.GetStrokeColorSpace();
                // ColorPt stroke = gs.GetStrokeColor();

                Font font = gs.GetFont();

                Console.Write("Font Name: ");
                Console.WriteLine(font.GetName());
                // font.IsFixedWidth();
                // font.IsSerif();
                // font.IsSymbolic();
                // font.IsItalic();
                // ...

                // double word_spacing = gs.GetWordSpacing();
                // double char_spacing = gs.GetCharSpacing();

                if (font.GetType() == Font.Type.e_Type3)
                {
                    // Type 3 font: each glyph is itself a content stream, so
                    // process its elements.
                    for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                    {
                        page_reader.Type3FontBegin(itr.Current());
                        ProcessElements(page_reader);
                        page_reader.End();
                    }
                }
                else
                {
                    // To get the exact character positioning information you need to
                    // concatenate current text matrix with CTM and then multiply
                    // relative positioning coordinates with the resulting matrix.
                    // Both matrices are fixed for the whole text run, so the product
                    // is computed once rather than per character.
                    Matrix2D ctm = element.GetCTM();
                    Matrix2D text_mtx = element.GetTextMatrix();
                    Matrix2D mtx = ctm * text_mtx;

                    /*
                    double font_sz_scale_factor = System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d);
                    double font_size = gs.GetFontSize();
                    Console.Write(" Font Size: {0:f}", font_sz_scale_factor * font_size);

                    ColorPt font_color = gs.GetFillColor();
                    ColorSpace cs = gs.GetFillColorSpace();

                    ColorPt rgb = new ColorPt();
                    cs.Convert2RGB(font_color, rgb);

                    Console.WriteLine(" Font Color(RGB): red={0:d} green={1:d} blue={2:d}",
                        (byte)(rgb.Get(0)*255),
                        (byte)(rgb.Get(1)*255),
                        (byte)(rgb.Get(2)*255));
                    */

                    for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                    {
                        Console.Write("Character code: ");
                        int char_code = itr.Current().char_code;

                        // Print only if in ASCII range. Both bounds must hold; the
                        // previous '||' made this test true for every value.
                        if (char_code >= 32 && char_code <= 127)
                        {
                            Console.Write((char)char_code);
                        }

                        double x = itr.Current().x; // character positioning information
                        double y = itr.Current().y;

                        mtx.Mult(ref x, ref y);
                        // Console.WriteLine(" Position: x={0:f} y={1:f}", x, y);
                    }
                }

                Console.WriteLine();
                break;
            }
        }
    }
}
317
318 static int image_counter = 0;
319
/// <summary>
/// Prints the pixel dimensions of an image element and saves it as a PNG file
/// named "reader_img_extract_N.png" under <c>output_path</c>, where N is the
/// running image counter.
/// </summary>
/// <param name="image">The image element to export.</param>
static public void ProcessImage(Element image)
{
    // Other image properties are available if needed:
    // bool image_mask = image.IsImageMask();
    // bool interpolate = image.IsImageInterpolate();
    int width = image.GetImageWidth();
    int height = image.GetImageHeight();

    Console.WriteLine("Image: width=\"{0:d}\" height=\"{1:d}\"", width, height);

    // Matrix2D mtx = image.GetCTM(); // image matrix (page positioning info)

    ++image_counter;

    // Bitmap is disposable; release it as soon as the file has been written
    // instead of leaving it for the finalizer.
    using (System.Drawing.Bitmap bmp = image.GetBitmap())
    {
        bmp.Save(output_path + "reader_img_extract_" + image_counter.ToString() + ".png", System.Drawing.Imaging.ImageFormat.Png);
    }

    // Alternatively you can use GetImageData to read the raw (decoded) image data
    // image.GetBitsPerComponent();
    // image.GetImageData(); // get raw image data
    // another approach is to use Image2RGB filter that converts every image to
    // RGB format. This could save you time since you don't need to deal with color
    // conversions, image up-sampling, decoding etc.
    // ----------------
    // Image2RGB img_conv = new Image2RGB(image); // Extract and convert image to RGB 8-bpc format
    // FilterReader reader = new FilterReader(img_conv); //
    // byte[] image_data_out = new byte[width * height * 3]; // A buffer used to keep image data.
    // reader.Read(image_data_out); // image_data_out contains RGB image data.
    // ----------------
    // Note that you don't need to read a whole image at a time. Alternatively
    // you can read a chunk at a time by repeatedly calling reader.Read(buf, buf_sz)
    // until the function returns 0.
}
352
/// <summary>
/// Reads elements from <paramref name="reader"/> until it returns null and
/// dispatches each one by type: paths, text blocks and images go to their
/// handlers, and form XObjects are entered and processed recursively.
/// Other element types are ignored.
/// </summary>
/// <param name="reader">Reader positioned on the content to walk.</param>
static void ProcessElements(ElementReader reader)
{
    // Read page contents
    for (Element current = reader.Next(); current != null; current = reader.Next())
    {
        Element.Type kind = current.GetType();

        if (kind == Element.Type.e_path)
        {
            // Process path data...
            ProcessPath(reader, current);
        }
        else if (kind == Element.Type.e_text_begin)
        {
            // Process text strings...
            ProcessText(reader);
        }
        else if (kind == Element.Type.e_form)
        {
            // Process form XObjects
            reader.FormBegin();
            ProcessElements(reader);
            reader.End();
        }
        else if (kind == Element.Type.e_image)
        {
            // Process Images
            ProcessImage(current);
        }
    }
}
386
/// <summary>
/// The main entry point for the application. Opens "newsletter.pdf" from
/// <c>input_path</c> and prints element information for every page.
/// A <c>PDFNetException</c> is reported on the console and sets a non-zero
/// process exit code.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
[STAThread]
static void Main(string[] args)
{
    try
    {
        PDFNet.Initialize(PDFTronLicense.Key);

        Console.WriteLine("-------------------------------------------------");
        Console.WriteLine("Extract page element information from all ");
        Console.WriteLine("pages in the document.");

        // Open the test file
        using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
        {
            doc.InitSecurityHandler();

            using (ElementReader page_reader = new ElementReader())
            {
                for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) // Read every page
                {
                    Console.WriteLine("Page {0:d}----------------------------------------",
                        itr.GetPageNumber());

                    Rect crop_box = itr.Current().GetCropBox();
                    crop_box.Normalize();

                    // Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2);
                    // Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height());

                    page_reader.Begin(itr.Current());
                    ProcessElements(page_reader);
                    page_reader.End();
                }
            }

            Console.WriteLine("Done.");
        }
    }
    catch (PDFNetException e)
    {
        Console.WriteLine(e.Message);
        // Report the failure to the caller, as the C++ version of this sample does.
        Environment.ExitCode = 1;
    }
    finally
    {
        // Runs even when an exception other than PDFNetException propagates.
        PDFNet.Terminate();
    }
}
437 }
438}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/PDFNet.h>
7#include <PDF/PDFDoc.h>
8#include <PDF/ElementReader.h>
9#include <PDF/Element.h>
10#include <PDF/Font.h>
11#include <Filters/FilterReader.h>
12#include <PDF/Image/Image2RGB.h>
13
14#include <iostream>
15#include <assert.h>
16#include "../../LicenseKey/CPP/LicenseKey.h"
17
18using namespace std;
19
20using namespace pdftron;
21using namespace PDF;
22using namespace SDF;
23using namespace Common;
24using namespace Filters;
25
26char m_buf[4000];
27
28void ProcessElements(ElementReader& reader);
29
30void ProcessPath(ElementReader& reader, Element path)
31{
32 if (path.IsClippingPath())
33 {
34 cout << "This is a clipping path" << endl;
35 }
36
37 PathData d = path.GetPathData();
38
39 const UChar* opr = &d.GetOperators().front();
40 const UChar *opr_itr = opr, *opr_end = opr + d.GetOperators().size();
41 const double* data = &d.GetPoints().front();
42 const double *data_itr = data, *data_end = data + d.GetPoints().size();
43
44 double x1, y1, x2, y2, x3, y3;
45
46 // Use path.GetCTM() if you are interested in CTM (current transformation matrix).
47
48 cout << " Path Data Points := \"";
49 for (; opr_itr<opr_end; ++opr_itr)
50 {
51 switch(*opr_itr)
52 {
53 case PathData::e_moveto:
54 x1 = *data_itr; ++data_itr;
55 y1 = *data_itr; ++data_itr;
56 sprintf(m_buf, "M%.0f %.0f", x1, y1);
57 cout << m_buf;
58 break;
59 case PathData::e_lineto:
60 x1 = *data_itr; ++data_itr;
61 y1 = *data_itr; ++data_itr;
62 sprintf(m_buf, " L%.0f %.0f", x1, y1);
63 cout << m_buf;
64 break;
65 case PathData::e_cubicto:
66 x1 = *data_itr; ++data_itr;
67 y1 = *data_itr; ++data_itr;
68 x2 = *data_itr; ++data_itr;
69 y2 = *data_itr; ++data_itr;
70 x3 = *data_itr; ++data_itr;
71 y3 = *data_itr; ++data_itr;
72 sprintf(m_buf, " C%.0f %.0f %.0f %.0f %.0f %.0f", x1, y1, x2, y2, x3, y3);
73 cout << m_buf;
74 break;
75 case PathData::e_rect:
76 {
77 x1 = *data_itr; ++data_itr;
78 y1 = *data_itr; ++data_itr;
79 double w = *data_itr; ++data_itr;
80 double h = *data_itr; ++data_itr;
81 x2 = x1 + w;
82 y2 = y1;
83 x3 = x2;
84 y3 = y1 + h;
85 double x4 = x1;
86 double y4 = y3;
87 sprintf(m_buf, "M%.0f %.0f L%.0f %.0f L%.0f %.0f L%.0f %.0f Z",
88 x1, y1, x2, y2, x3, y3, x4, y4);
89 cout << m_buf;
90 }
91 break;
92 case PathData::e_closepath:
93 cout << " Close Path" << endl;
94 break;
95 default:
96 assert(false);
97 break;
98 }
99 }
100
101 cout << "\" ";
102
103 GState gs = path.GetGState();
104
105 // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
106 if (path.IsStroked())
107 {
108 cout << "Stroke path" << endl;
109
110 if (gs.GetStrokeColorSpace().GetType() == ColorSpace::e_pattern)
111 {
112 cout << "Path has associated pattern" << endl;
113 }
114 else
115 {
116 // Get stroke color (you can use PDFNet color conversion facilities)
117 // ColorPt rgb;
118 // gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
119 }
120 }
121 else
122 {
123 // Do not stroke path
124 }
125
126 if (path.IsFilled())
127 {
128 cout << "Fill path" << endl;
129
130 if (gs.GetFillColorSpace().GetType() == ColorSpace::e_pattern)
131 {
132 cout << "Path has associated pattern" << endl;
133 }
134 else
135 {
136 // ColorPt rgb;
137 // gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
138 }
139 }
140 else
141 {
142 // Do not fill path
143 }
144
145 // Process any changes in graphics state ---------------------------------
146
147 GSChangesIterator gs_itr = reader.GetChangesIterator();
148 for (; gs_itr.HasNext(); gs_itr.Next())
149 {
150 switch(gs_itr.Current())
151 {
152 case GState::e_transform :
153 // Get transform matrix for this element. Unlike path.GetCTM()
154 // that return full transformation matrix gs.GetTransform() return
155 // only the transformation matrix that was installed for this element.
156 //
157 // gs.GetTransform();
158 break;
159 case GState::e_line_width :
160 // gs.GetLineWidth();
161 break;
162 case GState::e_line_cap :
163 // gs.GetLineCap();
164 break;
165 case GState::e_line_join :
166 // gs.GetLineJoin();
167 break;
168 case GState::e_flatness :
169 break;
170 case GState::e_miter_limit :
171 // gs.GetMiterLimit();
172 break;
173 case GState::e_dash_pattern :
174 {
175 // std::vector<double> dashes;
176 // gs.GetDashes(dashes);
177 // gs.GetPhase()
178 }
179 break;
180 case GState::e_fill_color:
181 {
182 if ( gs.GetFillColorSpace().GetType() == ColorSpace::e_pattern &&
183 gs.GetFillPattern().GetType() != PatternColor::e_shading )
184 {
185 //process the pattern data
186 reader.PatternBegin(true);
187 ProcessElements(reader);
188 reader.End();
189 }
190 }
191 break;
192 }
193 }
194 reader.ClearChangeList();
195}
196
197void ProcessText(ElementReader& page_reader)
198{
199 // Begin text element
200 cout << "Begin Text Block:" << endl;
201
202 Element element;
203 while ((element = page_reader.Next()) != 0)
204 {
205 switch (element.GetType())
206 {
207 case Element::e_text_end:
208 // Finish the text block
209 cout << "End Text Block." << endl;
210 return;
211
212 case Element::e_text:
213 {
214 GState gs = element.GetGState();
215
216 ColorSpace cs_fill = gs.GetFillColorSpace();
217 ColorPt fill = gs.GetFillColor();
218
219 ColorPt out;
220 cs_fill.Convert2RGB(fill, out);
221
222
223 ColorSpace cs_stroke = gs.GetStrokeColorSpace();
224 ColorPt stroke = gs.GetStrokeColor();
225
226 Font font = gs.GetFont();
227
228 cout << "Font Name: " << font.GetName() << endl;
229 // font.IsFixedWidth();
230 // font.IsSerif();
231 // font.IsSymbolic();
232 // font.IsItalic();
233 // ...
234
235 // double font_size = gs.GetFontSize();
236 // double word_spacing = gs.GetWordSpacing();
237 // double char_spacing = gs.GetCharSpacing();
238 // const UString* txt = element.GetTextString();
239
240 if ( font.GetType() == Font::e_Type3 )
241 {
242 //type 3 font, process its data
243 for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
244 {
245 page_reader.Type3FontBegin(itr.Current());
246 ProcessElements(page_reader);
247 page_reader.End();
248 }
249 }
250
251 else
252 {
253 Matrix2D text_mtx = element.GetTextMatrix();
254 double x, y;
255 unsigned int char_code;
256
257 for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
258 {
259 cout << "Character code: ";
260 char_code = itr.Current().char_code;
261 if (char_code>=32 || char_code<=127)
262 {
263 // Print if in ASCII range...
264 cout << char(char_code);
265 }
266
267 x = itr.Current().x; // character positioning information
268 y = itr.Current().y;
269
270 // Use element.GetCTM() if you are interested in the CTM
271 // (current transformation matrix).
272 Matrix2D ctm = element.GetCTM();
273
274 // To get the exact character positioning information you need to
275 // concatenate current text matrix with CTM and then multiply
276 // relative positioning coordinates with the resulting matrix.
277 Matrix2D mtx = ctm * text_mtx;
278 mtx.Mult(x, y);
279
280 // Get glyph path...
281 //vector<UChar> oprs;
282 //vector<double> glyph_data;
283 //font.GetGlyphPath(char_code, oprs, glyph_data, false, 0);
284 }
285 }
286
287 cout << endl;
288 }
289 break;
290 }
291 }
292}
293
294void ProcessImage(Element image)
295{
296 bool image_mask = image.IsImageMask();
297 bool interpolate = image.IsImageInterpolate();
298 int width = image.GetImageWidth();
299 int height = image.GetImageHeight();
300 int out_data_sz = width * height * 3;
301
302 cout << "Image:"
303 << " width=\"" << width << "\""
304 << " height=\"" << height << "\"" << endl;
305
306 // Matrix2D& mtx = image->GetCTM(); // image matrix (page positioning info)
307
308 // You can use GetImageData to read the raw (decoded) image data
309 //image->GetBitsPerComponent();
310 //image->GetImageData(); // get raw image data
311 // .... or use Image2RGB filter that converts every image to RGB format,
312 // This should save you time since you don't need to deal with color conversions,
313 // image up-sampling, decoding etc.
314
315 Image2RGB img_conv(image); // Extract and convert image to RGB 8-bpc format
316 FilterReader reader(img_conv);
317
318 // A buffer used to keep image data.
319 std::vector<UChar> image_data_out;
320 image_data_out.resize(out_data_sz);
321
322 reader.Read(&image_data_out.front(), out_data_sz);
323 // &image_data_out.front() contains RGB image data.
324
325 // Note that you don't need to read a whole image at a time. Alternatively
326 // you can read a chuck at a time by repeatedly calling reader.Read(buf, buf_sz)
327 // until the function returns 0.
328}
329
330void ProcessElements(ElementReader& reader)
331{
332 Element element;
333 while ((element = reader.Next()) != 0) // Read page contents
334 {
335 switch (element.GetType())
336 {
337 case Element::e_path: // Process path data...
338 {
339 ProcessPath(reader, element);
340 }
341 break;
342 case Element::e_text_begin: // Process text block...
343 {
344 ProcessText(reader);
345 }
346 break;
347 case Element::e_form: // Process form XObjects
348 {
349 reader.FormBegin();
350 ProcessElements(reader);
351 reader.End();
352 }
353 break;
354 case Element::e_image: // Process Images
355 {
356 ProcessImage(element);
357 }
358 break;
359 }
360 }
361}
362
363int main(int argc, char *argv[])
364{
365 int ret = 0;
366 PDFNet::Initialize(LicenseKey);
367
368 // Relative path to the folder containing test files.
369 string input_path = "../../TestFiles/";
370 // string output_path = "../../TestFiles/Output/";
371
372
373 try // Extract text data from all pages in the document
374 {
375 cout << "-------------------------------------------------" << endl;
376 cout << "Extract page element information from all " << endl;
377 cout << "pages in the document." << endl;
378
379 PDFDoc doc((input_path + "newsletter.pdf").c_str());
380 doc.InitSecurityHandler();
381
382 int pgnum = doc.GetPageCount();
383 PageIterator page_begin = doc.GetPageIterator();
384
385 ElementReader page_reader;
386
387 PageIterator itr;
388 for (itr = page_begin; itr.HasNext(); itr.Next()) // Read every page
389 {
390 cout << "Page " << itr.Current().GetIndex() << "----------------------------------------" << endl;
391 page_reader.Begin(itr.Current());
392 ProcessElements(page_reader);
393 page_reader.End();
394 }
395
396 cout << "Done." << endl;
397 }
398 catch(Exception& e)
399 {
400 cout << e << endl;
401 ret = 1;
402 }
403 catch(...)
404 {
405 cout << "Unknown Exception" << endl;
406 ret = 1;
407 }
408
409 PDFNet::Terminate();
410 return ret;
411}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8 "fmt"
9 "os"
10 "strconv"
11 . "pdftron"
12)
13
14import "pdftron/Samples/LicenseKey/GO"
15
16func ProcessPath(reader ElementReader, path Element){
17 if path.IsClippingPath(){
18 fmt.Println("This is a clipping path")
19 }
20
21 pathData := path.GetPathData()
22 data := pathData.GetPoints()
23 opr := pathData.GetOperators()
24
25 oprIndex := 0
26 oprEnd := int(opr.Size())
27 dataIndex := 0
28 //dataEnd := data.Size()
29
30 // Use path.GetCTM() if you are interested in CTM (current transformation matrix).
31
32 os.Stdout.Write([]byte("Path Data Points := \""))
33 x1, x2, x3, x4 := 0.0, 0.0, 0.0, 0.0
34 y1, y2, y3, y4 := 0.0, 0.0, 0.0, 0.0
35 for oprIndex < oprEnd{
36 if int(opr.Get(oprIndex)) == int(PathDataE_moveto){
37 x1 = data.Get(dataIndex)
38 dataIndex = dataIndex + 1
39 y1 = data.Get(dataIndex)
40 dataIndex = dataIndex + 1
41 os.Stdout.Write([]byte("M" + fmt.Sprintf("%f", x1) + " " + fmt.Sprintf("%f", y1)))
42 }else if int(opr.Get(oprIndex)) == int(PathDataE_lineto){
43 x1 = data.Get(dataIndex)
44 dataIndex = dataIndex + 1
45 y1 = data.Get(dataIndex)
46 dataIndex = dataIndex + 1
47 os.Stdout.Write([]byte(" L" + fmt.Sprintf("%f", x1) + " " + fmt.Sprintf("%f", y1)))
48 }else if int(opr.Get(oprIndex)) == int(PathDataE_cubicto){
49 x1 = data.Get(dataIndex)
50 dataIndex = dataIndex + 1
51 y1 = data.Get(dataIndex)
52 dataIndex = dataIndex + 1
53 x2 = data.Get(dataIndex)
54 dataIndex = dataIndex + 1
55 y2 = data.Get(dataIndex)
56 dataIndex = dataIndex + 1
57 x3 = data.Get(dataIndex)
58 dataIndex = dataIndex + 1
59 y3 = data.Get(dataIndex)
60 dataIndex = dataIndex + 1
61 os.Stdout.Write([]byte(" C" + fmt.Sprintf("%f", x1) + " " + fmt.Sprintf("%f", y1) + " " + fmt.Sprintf("%f", x2) + " " + fmt.Sprintf("%f", y2) + " " + fmt.Sprintf("%f", x3) + " " + fmt.Sprintf("%f", y3)))
62 }else if int(opr.Get(oprIndex)) == int(PathDataE_rect){
63 x1 = data.Get(dataIndex)
64 dataIndex = dataIndex + 1
65 y1 = data.Get(dataIndex)
66 dataIndex = dataIndex + 1
67 w := data.Get(dataIndex)
68 dataIndex = dataIndex + 1
69 h := data.Get(dataIndex)
70 dataIndex = dataIndex + 1
71 x2 = x1 + w
72 y2 = y1
73 x3 = x2
74 y3 = y1 + h
75 x4 = x1
76 y4 = y3
77 os.Stdout.Write([]byte("M" + fmt.Sprintf("%.2f", x1) + " " + fmt.Sprintf("%.2f", y1) + " L" + fmt.Sprintf("%.2f", x2) + " " + fmt.Sprintf("%.2f", y2) + " L" + fmt.Sprintf("%.2f", x3) + " " + fmt.Sprintf("%.2f", y3) + " L" + fmt.Sprintf("%.2f", x4) + " " + fmt.Sprintf("%.2f", y4) + " Z"))
78 }else if int(opr.Get(oprIndex)) == int(PathDataE_closepath){
79 fmt.Println(" Close Path")
80 }else{
81 //
82 }
83 oprIndex = oprIndex + 1
84 }
85
86 os.Stdout.Write([]byte("\" "))
87 gs := path.GetGState()
88
89 // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
90 if path.IsStroked(){
91 fmt.Println("Stroke path")
92
93 if (gs.GetStrokeColorSpace().GetType() == ColorSpaceE_pattern){
94 fmt.Println("Path has associated pattern")
95 }else{
96 // Get stroke color (you can use PDFNet color conversion facilities)
97 // rgb = gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor())
98 }
99 }else{
100 // Do not stroke path
101 }
102
103 if path.IsFilled(){
104 fmt.Println("Fill path")
105
106 if (gs.GetFillColorSpace().GetType() == ColorSpaceE_pattern){
107 fmt.Println("Path has associated pattern")
108 }else{
109 // rgb = gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor())
110 }
111 }else{
112 // Do not fill path
113 }
114
115 // Process any changes in graphics state ---------------------------------
116 gsItr := reader.GetChangesIterator()
117 for gsItr.HasNext(){
118 if int(gsItr.Current()) == int(GStateE_transform){
119 // Get transform matrix for this element. Unlike path.GetCTM()
120 // that return full transformation matrix gs.GetTransform() return
121 // only the transformation matrix that was installed for this element.
122 //
123 // gs.GetTransform()
124
125 }else if int(gsItr.Current()) == int(GStateE_line_width){
126 // gs.GetLineWidth()
127
128 }else if int(gsItr.Current()) == int(GStateE_line_cap){
129 // gs.GetLineCap()
130
131 }else if int(gsItr.Current()) == int(GStateE_line_join){
132 // gs.GetLineJoin()
133
134 }else if int(gsItr.Current()) == int(GStateE_flatness){
135
136 }else if int(gsItr.Current()) == int(GStateE_miter_limit){
137 // gs.GetMiterLimit()
138
139 }else if int(gsItr.Current()) == int(GStateE_dash_pattern){
140 // dashes = gs.GetDashes()
141 // gs.GetPhase()
142
143 }else if int(gsItr.Current()) == int(GStateE_fill_color){
144 if (int(gs.GetFillColorSpace().GetType()) == int(ColorSpaceE_pattern) && int(gs.GetFillPattern().GetType()) != int(PatternColorE_shading) ){
145 // process the pattern data
146 reader.PatternBegin(true)
147 ProcessElements(reader)
148 reader.End()
149 }
150 }
151 gsItr.Next()
152 }
153 reader.ClearChangeList()
154}
155
156func ProcessText (pageReader ElementReader){
157 // Begin text element
158 fmt.Println("Begin Text Block:")
159
160 element := pageReader.Next()
161
162 for element.GetMp_elem().Swigcptr() != 0{
163 etype := element.GetType()
164 if etype == ElementE_text_end{
165 // Finish the text block
166 fmt.Println("End Text Block.")
167 return
168 }else if etype == ElementE_text{
169 gs := element.GetGState()
170
171 //csFill := gs.GetFillColorSpace()
172 //fill := gs.GetFillColor()
173
174 //out := csFill.Convert2RGB(fill)
175
176 //csStroke := gs.GetStrokeColorSpace()
177 //stroke := gs.GetStrokeColor()
178
179 font := gs.GetFont()
180 fmt.Println("Font Name: " + font.GetName())
181 // font.IsFixedWidth()
182 // font.IsSerif()
183 // font.IsSymbolic()
184 // font.IsItalic()
185 // ...
186
187 // fontSize = gs.GetFontSize()
188 // wordSpacing = gs.GetWordSpacing()
189 // charSpacing = gs.GetCharSpacing()
190 // txt := element.GetTextString()
191 if font.GetType() == FontE_Type3{
192 // type 3 font, process its data
193 itr := element.GetCharIterator()
194 for itr.HasNext(){
195 pageReader.Type3FontBegin(itr.Current())
196 ProcessElements(pageReader)
197 pageReader.End()
198 }
199 }else{
200 text_mtx := element.GetTextMatrix()
201
202 itr := element.GetCharIterator()
203 for itr.HasNext(){
204 charCode := itr.Current().GetChar_data()
205 if *charCode >= 32 && *charCode <= 255 { // Print if in ASCII range...
206 a := font.MapToUnicode(uint(*charCode))
207 os.Stdout.Write([]byte( a )) // Revisit: if sys.version_info.major < 3 else ascii(a[0]) ))
208 }
209 pt := NewPoint()
210 pt.SetX(itr.Current().GetX()) // character positioning information
211 pt.SetY(itr.Current().GetY())
212
213 // Use element.GetCTM() if you are interested in the CTM
214 // (current transformation matrix).
215 ctm := element.GetCTM()
216
217 // To get the exact character positioning information you need to
218 // concatenate current text matrix with CTM and then multiply
219 // relative positioning coordinates with the resulting matrix.
220 mtx := ctm.Multiply(text_mtx)
221 mtx.Mult(pt)
222 itr.Next()
223 }
224 }
225 fmt.Println("")
226 }
227 element = pageReader.Next()
228 }
229}
230
231func ProcessImage (image Element){
232 //imageMask := image.IsImageMask()
233 //interpolate := image.IsImageInterpolate()
234 width := image.GetImageWidth()
235 height := image.GetImageHeight()
236 outDataSz := width * height * 3
237
238 fmt.Println("Image: width=\"" + fmt.Sprintf("%d", width) + "\"" + " height=\"" + fmt.Sprintf("%d", height)+ "\"" )
239
240 // Matrix2D& mtx = image->GetCTM() // image matrix (page positioning info)
241
242 // You can use GetImageData to read the raw (decoded) image data
243 //image->GetBitsPerComponent()
244 //image->GetImageData() // get raw image data
245 // .... or use Image2RGB filter that converts every image to RGB format,
246 // This should save you time since you don't need to deal with color conversions,
247 // image up-sampling, decoding etc.
248
249 imgConv := NewImage2RGB(image) // Extract and convert image to RGB 8-bps format
250 reader := NewFilterReader(imgConv)
251
252 //imageDataOut := reader.Read(int64(outDataSz))
253 reader.Read(int64(outDataSz))
254
255 // Note that you don't need to read a whole image at a time. Alternatively
256 // you can read a chuck at a time by repeatedly calling reader.Read(buf, buf_sz)
257 // until the function returns 0.
258}
259
260func ProcessElements(reader ElementReader){
261 element := reader.Next() // Read page contents
262 for element.GetMp_elem().Swigcptr() != 0{
263 etype := element.GetType()
264 if etype == ElementE_path{ // Process path data...
265 ProcessPath(reader, element)
266 }else if etype == ElementE_text_begin{ // Process text block...
267 ProcessText(reader)
268 }else if etype == ElementE_form{ // Process form XObjects
269 reader.FormBegin()
270 ProcessElements(reader)
271 reader.End()
272 }else if etype == ElementE_image{ // Process Images
273 ProcessImage(element)
274 }
275 element = reader.Next()
276 }
277}
278
279func main(){
280 PDFNetInitialize(PDFTronLicense.Key)
281
282 // Relative path to the folder containing the test files.
283 inputPath := "../../TestFiles/"
284 //outputPath := "../../TestFiles/Output/"
285
286 // Extract text data from all pages in the document
287
288 fmt.Println("__________________________________________________")
289 fmt.Println("Extract page element information from all ")
290 fmt.Println("pages in the document.")
291
292 doc := NewPDFDoc(inputPath + "newsletter.pdf")
293 doc.InitSecurityHandler()
294 //pgnum := doc.GetPageCount()
295 pageBegin := doc.GetPageIterator()
296 pageReader := NewElementReader()
297
298 itr := pageBegin
299 for itr.HasNext(){ // Read every page
300 fmt.Println("Page " + strconv.Itoa(itr.Current().GetIndex()) + "----------------------------------------")
301 pageReader.Begin(itr.Current())
302 ProcessElements(pageReader)
303 pageReader.End()
304 itr.Next()
305 }
306 doc.Close()
307 PDFNetTerminate()
308 fmt.Println("Done.")
309}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import com.pdftron.sdf.*;
7import com.pdftron.pdf.*;
8import com.pdftron.common.*;
9import com.pdftron.filters.FilterReader;
10
11
12public class ElementReaderAdvTest {
13
14 static String m_buf;
15
16 static void ProcessPath(ElementReader reader, Element path) throws PDFNetException {
17 if (path.isClippingPath()) {
18 System.out.println("This is a clipping path");
19 }
20
21 PathData pathData = path.getPathData();
22 double[] data = pathData.getPoints();
23 byte[] opr = pathData.getOperators();
24
25 double x1, y1, x2, y2, x3, y3;
26 // Use path.getCTM() if you are interested in CTM (current transformation matrix).
27
28 System.out.print(" Path Data Points := \"");
29 int data_index = 0;
30 for (int opr_index = 0; opr_index < opr.length; ++opr_index) {
31 switch (opr[opr_index]) {
32 case PathData.e_moveto:
33 x1 = data[data_index];
34 ++data_index;
35 y1 = data[data_index];
36 ++data_index;
37 System.out.print("M" + x1 + " " + y1);
38 break;
39 case PathData.e_lineto:
40 x1 = data[data_index];
41 ++data_index;
42 y1 = data[data_index];
43 ++data_index;
44 System.out.print(" L" + x1 + " " + y1);
45
46 break;
47 case PathData.e_cubicto:
48 x1 = data[data_index];
49 ++data_index;
50 y1 = data[data_index];
51 ++data_index;
52 x2 = data[data_index];
53 ++data_index;
54 y2 = data[data_index];
55 ++data_index;
56 x3 = data[data_index];
57 ++data_index;
58 y3 = data[data_index];
59 ++data_index;
60 System.out.print(" C" + x1 + " " + y1 + " " + x2 + " " + y2 + " " + x3 + " " + y3);
61 break;
62 case PathData.e_rect: {
63 x1 = data[data_index];
64 ++data_index;
65 y1 = data[data_index];
66 ++data_index;
67 double w = data[data_index];
68 ++data_index;
69 double h = data[data_index];
70 ++data_index;
71 x2 = x1 + w;
72 y2 = y1;
73 x3 = x2;
74 y3 = y1 + h;
75 double x4 = x1;
76 double y4 = y3;
77 System.out.print("M" + x1 + " " + y1 + " L" + x2 + " " + y2 + " L" + x3 + " " + y3 + " L" + x4 + " " + y4 + " Z");
78 }
79 break;
80 case PathData.e_closepath:
81 System.out.println(" Close Path");
82 break;
83 default:
84 throw new PDFNetException("Invalid Element Type", 0, "", "", "");
85 }
86 }
87
88 System.out.print("\" ");
89
90 GState gs = path.getGState();
91
92 // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
93 if (path.isStroked()) {
94 System.out.println("Stroke path");
95
96 if (gs.getStrokeColorSpace().getType() == ColorSpace.e_pattern) {
97 System.out.println("Path has associated pattern");
98 } else {
99 // Get stroke color (you can use PDFNet color conversion facilities)
100 ColorPt rgb = new ColorPt();
101 rgb = gs.getStrokeColor();
102 double v = rgb.get(0);
103 rgb = gs.getStrokeColorSpace().convert2RGB(rgb);
104 v = rgb.get(0);
105 }
106 } else {
107 // Do not stroke path
108 }
109
110 if (path.isFilled()) {
111 System.out.println("Fill path");
112
113 if (gs.getFillColorSpace().getType() == ColorSpace.e_pattern) {
114 System.out.println("Path has associated pattern");
115 PatternColor pat = gs.getFillPattern();
116 int type = pat.getType();
117 if (type == PatternColor.e_shading) {
118 System.out.println("Shading");
119 Shading shading = pat.getShading();
120 if (shading.getType() == Shading.e_function_shading) {
121 System.out.println("FUNCT");
122 } else if (shading.getType() == Shading.e_axial_shading) {
123 System.out.println("AXIAL");
124 } else if (shading.getType() == Shading.e_radial_shading) {
125 System.out.println("RADIAL");
126 }
127 } else if (type == PatternColor.e_colored_tiling_pattern) {
128 System.out.println("e_colored_tiling_pattern");
129 } else if (type == PatternColor.e_uncolored_tiling_pattern) {
130 System.out.println("e_uncolored_tiling_pattern");
131 } else {
132 System.out.println("?");
133 }
134 } else {
135 ColorPt rgb = new ColorPt();
136 rgb = gs.getFillColor();
137 double v = rgb.get(0);
138 rgb = gs.getFillColorSpace().convert2RGB(rgb);
139 v = rgb.get(0);
140 }
141 } else {
142 // Do not fill path
143 }
144
145 // Process any changes in graphics state ---------------------------------
146
147 GSChangesIterator gs_itr = reader.getChangesIterator();
148 while (gs_itr.hasNext()) {
149 switch (gs_itr.next().intValue()) {
150 case GState.e_transform:
151 // Get transform matrix for this element. Unlike path.GetCTM()
152 // that return full transformation matrix gs.GetTransform() return
153 // only the transformation matrix that was installed for this element.
154 //
155 //gs.getTransform();
156 break;
157 case GState.e_line_width:
158 //gs.getLineWidth();
159 break;
160 case GState.e_line_cap:
161 //gs.getLineCap();
162 break;
163 case GState.e_line_join:
164 //gs.getLineJoin();
165 break;
166 case GState.e_flatness:
167 break;
168 case GState.e_miter_limit:
169 //gs.getMiterLimit();
170 break;
171 case GState.e_dash_pattern: {
172 //double[] dashes;
173 //dashes=gs.getDashes();
174 //gs.getPhase();
175 }
176 break;
177 case GState.e_fill_color: {
178 if (gs.getFillColorSpace().getType() == ColorSpace.e_pattern &&
179 gs.getFillPattern().getType() != PatternColor.e_shading) {
180 //process the pattern data
181 reader.patternBegin(true);
182 ProcessElements(reader);
183 reader.end();
184 }
185 }
186 break;
187 }
188 }
189 reader.clearChangeList();
190 }
191
192 static void ProcessText(ElementReader page_reader) throws PDFNetException {
193 // Begin text element
194 System.out.println("Begin Text Block:");
195
196 Element element;
197 while ((element = page_reader.next()) != null) {
198 switch (element.getType()) {
199 case Element.e_text_end:
200 // Finish the text block
201 System.out.println("End Text Block.");
202 return;
203
204 case Element.e_text: {
205 GState gs = element.getGState();
206
207 ColorSpace cs_fill = gs.getFillColorSpace();
208 ColorPt fill = gs.getFillColor();
209
210 ColorPt out;
211 out = cs_fill.convert2RGB(fill);
212
213
214 ColorSpace cs_stroke = gs.getStrokeColorSpace();
215 ColorPt stroke = gs.getStrokeColor();
216
217 Font font = gs.getFont();
218
219 System.out.println("Font Name: " + font.getName());
220 //font.isFixedWidth();
221 //font.isSerif();
222 //font.isSymbolic();
223 //font.isItalic();
224 // ...
225
226 //double font_size = gs.getFontSize();
227 //double word_spacing = gs.getWordSpacing();
228 //double char_spacing = gs.getCharSpacing();
229 //String txt = element.getTextString();
230
231 if (font.getType() == Font.e_Type3) {
232 //type 3 font, process its data
233 for (CharIterator itr = element.getCharIterator(); itr.hasNext(); ) {
234 page_reader.type3FontBegin(itr.next(), null);
235 ProcessElements(page_reader);
236 page_reader.end();
237 }
238 } else {
239 Matrix2D text_mtx = element.getTextMatrix();
240 double x, y;
241 long char_code;
242
243 for (CharIterator itr = element.getCharIterator(); itr.hasNext(); ) {
244 CharData data = itr.next();
245 char_code = data.getCharCode();
246 //System.out.print("Character code: ");
247
248 System.out.print(String.valueOf(char_code));
249
250 x = data.getGlyphX(); // character positioning information
251 y = data.getGlyphY();
252
253 // Use element.getCTM() if you are interested in the CTM
254 // (current transformation matrix).
255 Matrix2D ctm = element.getCTM();
256
257 // To get the exact character positioning information you need to
258 // concatenate current text matrix with CTM and then multiply
259 // relative positioning coordinates with the resulting matrix.
260 //
261 Matrix2D mtx = ctm.multiply(text_mtx);
262 java.awt.geom.Point2D.Double t = mtx.multPoint(x, y);
263 x = t.x;
264 y = t.y;
265 //System.out.println(" Position: x=" + x + " y=" + y );
266 }
267
268 System.out.println();
269 }
270 }
271 break;
272 }
273 }
274 }
275
276 static void ProcessImage(Element image) throws PDFNetException {
277 boolean image_mask = image.isImageMask();
278 boolean interpolate = image.isImageInterpolate();
279 int width = image.getImageWidth();
280 int height = image.getImageHeight();
281 int out_data_sz = width * height * 3;
282
283 System.out.println("Image: " +
284 " width=\"" + width + "\""
285 + " height=\"" + height);
286
287 // Matrix2D& mtx = image->GetCTM(); // image matrix (page positioning info)
288
289 // You can use GetImageData to read the raw (decoded) image data
290 //image->GetBitsPerComponent();
291 //image->GetImageData(); // get raw image data
292 // .... or use Image2RGB filter that converts every image to RGB format,
293 // This should save you time since you don't need to deal with color conversions,
294 // image up-sampling, decoding etc.
295
296 Image2RGB img_conv = new Image2RGB(image); // Extract and convert image to RGB 8-bpc format
297 FilterReader reader = new FilterReader(img_conv);
298
299 // A buffer used to keep image data.
300 byte[] buf = new byte[out_data_sz];
301 long image_data_out = reader.read(buf);
302 // &image_data_out.front() contains RGB image data.
303
304 // Note that you don't need to read a whole image at a time. Alternatively
305 // you can read a chunk at a time by repeatedly calling reader.Read(buf)
306 // until the function returns 0.
307 }
308
309 static void ProcessElements(ElementReader reader) throws PDFNetException {
310 Element element;
311 while ((element = reader.next()) != null) // Read page contents
312 {
313 switch (element.getType()) {
314 case Element.e_path: // Process path data...
315 {
316 ProcessPath(reader, element);
317 }
318 break;
319 case Element.e_text_begin: // Process text block...
320 {
321 ProcessText(reader);
322 }
323 break;
324 case Element.e_form: // Process form XObjects
325 {
326 reader.formBegin();
327 ProcessElements(reader);
328 reader.end();
329 }
330 break;
331 case Element.e_image: // Process Images
332 {
333 ProcessImage(element);
334 }
335 break;
336 }
337 }
338 }
339
340 public static void main(String[] args) {
341 PDFNet.initialize(PDFTronLicense.Key());
342
343 // Relative path to the folder containing test files.
344 String input_path = "../../TestFiles/";
345 // string output_path = "../../TestFiles/Output/";
346
347 System.out.println("__________________________________________________");
348 System.out.println("Extract page element information from all ");
349 System.out.println("pages in the document.");
350 try (PDFDoc doc = new PDFDoc((input_path + "newsletter.pdf"))) // Extract text data from all pages in the document
351 {
352 doc.initSecurityHandler();
353
354 int pgnum = doc.getPageCount();
355 PageIterator page_begin = doc.getPageIterator();
356
357 ElementReader page_reader = new ElementReader();
358
359 PageIterator itr;
360
361 for (itr = page_begin; itr.hasNext(); ) // Read every page
362 {
363 Page nextPage = itr.next();
364 System.out.println("Page " + nextPage.getIndex() +
365 "----------------------------------------");
366 page_reader.begin(nextPage);
367 ProcessElements(page_reader);
368 page_reader.end();
369 }
370 System.out.println("Done");
371 } catch (Exception e) {
372 System.out.println(e);
373 }
374
375 PDFNet.terminate();
376 }
377}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6
7const { PDFNet } = require('@pdftron/pdfnet-node');
8const PDFTronLicense = require('../LicenseKey/LicenseKey');
9
10((exports) => {
11
12 exports.runElementReaderAdvTest = () => {
13
/**
 * Logs the path segments of `path`, reports whether it is stroked and/or
 * filled, then walks the graphics-state changes recorded by `reader`.
 *
 * @param reader element reader that produced `path`; used for the change
 *               list and for descending into tiling patterns
 * @param path   the path element to describe
 * @throws {Error} when the path data contains an unknown segment operator
 */
const processPath = async (reader, path) => {
    if (await path.isClippingPath()) {
        console.log('This is a clipping path');
    }

    const d = await path.getPathData();

    const opr = d.operators;
    const opr_len = opr.byteLength;
    const data = d.points;
    let data_idx = 0;

    let x1, y1, x2, y2, x3, y3;

    // Use path.getCTM() if you are interested in CTM (current transformation matrix).

    let path_str = ' Path Data Points := "';
    for (let opr_idx = 0; opr_idx < opr_len; ++opr_idx) {
        switch (opr[opr_idx]) {
            case PDFNet.Element.PathSegmentType.e_moveto:
                x1 = data[data_idx++];
                y1 = data[data_idx++];
                path_str += 'M' + Math.round(x1) + ' ' + Math.round(y1);
                break;
            case PDFNet.Element.PathSegmentType.e_lineto:
                x1 = data[data_idx++];
                y1 = data[data_idx++];
                path_str += 'L' + Math.round(x1) + ' ' + Math.round(y1);
                break;
            case PDFNet.Element.PathSegmentType.e_cubicto:
                x1 = data[data_idx++];
                y1 = data[data_idx++];
                x2 = data[data_idx++];
                y2 = data[data_idx++];
                x3 = data[data_idx++];
                y3 = data[data_idx++];
                path_str += 'C' + Math.round(x1) + ' ' + Math.round(y1) + ' ' + Math.round(x2)
                    + ' ' + Math.round(y2) + ' ' + Math.round(x3) + ' ' + Math.round(y3);
                break;
            case PDFNet.Element.PathSegmentType.e_rect: {
                // Braces keep the const declarations local to this case.
                // A rectangle is stored as (x, y, width, height); expand it to its four corners.
                x1 = data[data_idx++];
                y1 = data[data_idx++];
                const w = data[data_idx++];
                const h = data[data_idx++];
                x2 = x1 + w;
                y2 = y1;
                x3 = x2;
                y3 = y1 + h;
                const x4 = x1;
                const y4 = y3;
                path_str += 'M' + Math.round(x1) + ' ' + Math.round(y1) + ' L' + Math.round(x2) + ' ' + Math.round(y2)
                    + ' L' + Math.round(x3) + ' ' + Math.round(y3) + ' L' + Math.round(x4) + ' ' + Math.round(y4) + ' Z';
                break;
            }
            case PDFNet.Element.PathSegmentType.e_closepath:
                path_str += ' Close Path\n';
                break;
            default:
                // Throw a real Error so the failure carries a message and a stack trace.
                throw new Error('Invalid Element Type');
        }
    }

    path_str += '" ';

    const gs = await path.getGState();

    // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    // path_str is flushed with the first status line that gets printed and then
    // cleared so it is not printed twice.
    if (await path.isStroked()) {
        console.log(path_str + 'Stroke path');
        path_str = '';

        if (await (await gs.getStrokeColorSpace()).getType() === PDFNet.ColorSpace.Type.e_pattern) {
            console.log('Path has associated pattern');
        } else {
            // Get stroke color (you can use PDFNet color conversion facilities)
            // ColorPt rgb;
            // gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
        }
    } else {
        // Do not stroke path
    }

    if (await path.isFilled()) {
        console.log(path_str + 'Fill path');
        path_str = '';

        if (await (await gs.getFillColorSpace()).getType() === PDFNet.ColorSpace.Type.e_pattern) {
            console.log('Path has associated pattern');
        } else {
            // ColorPt rgb;
            // gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
        }
    } else {
        // Do not fill path
    }

    // Neither stroked nor filled: the path data has not been printed yet.
    if (path_str) {
        console.log(path_str);
    }

    // Process any changes in graphics state ---------------------------------

    const gs_itr = await reader.getChangesIterator();
    for (; await gs_itr.hasNext(); await gs_itr.next()) {
        switch (await gs_itr.current()) {
            case PDFNet.GState.Attribute.e_transform:
                // Get transform matrix for this element. Unlike path.GetCTM()
                // that returns the full transformation matrix, gs.GetTransform() returns
                // only the transformation matrix that was installed for this element.
                //
                // gs.GetTransform();
                break;
            case PDFNet.GState.Attribute.e_line_width:
                // gs.GetLineWidth();
                break;
            case PDFNet.GState.Attribute.e_line_cap:
                // gs.GetLineCap();
                break;
            case PDFNet.GState.Attribute.e_line_join:
                // gs.GetLineJoin();
                break;
            case PDFNet.GState.Attribute.e_flatness:
                break;
            case PDFNet.GState.Attribute.e_miter_limit:
                // gs.GetMiterLimit();
                break;
            case PDFNet.GState.Attribute.e_dash_pattern:
                // std::vector<double> dashes;
                // gs.GetDashes(dashes);
                // gs.GetPhase()
                break;
            case PDFNet.GState.Attribute.e_fill_color:
                if (await (await gs.getFillColorSpace()).getType() === PDFNet.ColorSpace.Type.e_pattern &&
                    await (await gs.getFillPattern()).getType() !== PDFNet.PatternColor.Type.e_shading) {
                    // Process the pattern data
                    await reader.patternBegin(true);
                    await processElements(reader);
                    await reader.end();
                }
                break;
        }
    }
    await reader.clearChangeList();
};
162
/**
 * Consumes elements from `pageReader` until the end of the current text
 * block, logging the font name and the printable characters of each text run.
 * Type 3 font glyphs are processed recursively as element streams.
 *
 * @param pageReader element reader positioned just after a text-begin element
 */
const processText = async (pageReader) => {
    // Begin text element
    console.log('Begin Text Block:');

    let element;
    while (element = await pageReader.next()) {
        switch (await element.getType()) {
            case PDFNet.Element.Type.e_text_end:
                // Finish the text block
                console.log('End Text Block.');
                return;

            case PDFNet.Element.Type.e_text: {
                // Braces keep the const declarations local to this case.
                const gs = await element.getGState();

                const cs_fill = await gs.getFillColorSpace();
                const fill = await gs.getFillColor();

                const out = await cs_fill.convert2RGB(fill);

                const cs_stroke = await gs.getStrokeColorSpace();
                const stroke = await gs.getStrokeColor();

                const font = await gs.getFont();

                console.log('Font Name: ' + await font.getName());

                let outPutStr = '';
                if (await font.getType() == PDFNet.Font.Type.e_Type3) {
                    // Type 3 font, process its data
                    for (const itr = await element.getCharIterator(); await itr.hasNext(); await itr.next()) {
                        await pageReader.type3FontBegin(await itr.current());
                        await processElements(pageReader);
                        await pageReader.end();
                    }
                } else {
                    const text_mtx = await element.getTextMatrix();

                    for (const itr = await element.getCharIterator(); await itr.hasNext(); await itr.next()) {
                        outPutStr += 'Character code: ';
                        const charData = await itr.current();
                        const charCode = charData.char_code;
                        // Both bounds must hold; with `||` the test was always true
                        // and non-printable codes were emitted as well.
                        if (charCode >= 32 && charCode <= 127) {
                            // Print if in ASCII range...
                            outPutStr += String.fromCharCode(charCode);
                        }

                        const x = charData.x; // character positioning information
                        const y = charData.y;

                        // Use element.getCTM() if you are interested in the CTM
                        // (current transformation matrix).
                        const ctm = await element.getCTM();

                        // To get the exact character positioning information you need to
                        // concatenate current text matrix with CTM and then multiply
                        // relative positioning coordinates with the resulting matrix.
                        // NOTE(review): assumes multiply() resolves to the product matrix
                        // rather than modifying ctm in place - confirm against the SDK reference.
                        const mtx = await ctm.multiply(text_mtx);
                        await mtx.mult(x, y);
                    }
                }
                console.log(outPutStr);
                break;
            }
        }
    }
};
230
/**
 * Logs the pixel dimensions of `image` and reads its data through the
 * Image2RGB filter.
 *
 * @param image the image element to inspect
 */
const processImage = async (image) => {
    const width = await image.getImageWidth();
    const height = await image.getImageHeight();
    // Plain arithmetic on already-resolved numbers, so no await is needed here.
    // Three bytes (R, G, B) per pixel.
    const out_data_sz = width * height * 3;

    console.log('Image: width=\'' + width + '\' height=\'' + height + '\'');

    const img_conv = await PDFNet.Filter.createImage2RGBFromElement(image); // Extract and convert image to RGB 8-bpc format
    const reader = await PDFNet.FilterReader.create(img_conv);

    const image_data_out = await reader.read(out_data_sz);

    // Note that you don't need to read a whole image at a time. Alternatively
    // you can read a chunk at a time by repeatedly calling reader.read(buf_sz)
    // until the function returns no more data.
};
247
/**
 * Drains `reader`, dispatching each element to the handler for its type.
 * Form XObjects are entered and processed recursively.
 *
 * @param reader element reader to read page contents from
 */
const processElements = async (reader) => {
    const ElementType = PDFNet.Element.Type;

    // Read page contents until the reader returns a falsy value.
    for (let el = await reader.next(); el; el = await reader.next()) {
        const kind = await el.getType();

        if (kind === ElementType.e_path) {
            // Process path data...
            await processPath(reader, el);
        } else if (kind === ElementType.e_text_begin) {
            // Process text block...
            await processText(reader);
        } else if (kind === ElementType.e_form) {
            // Process form XObjects
            await reader.formBegin();
            await processElements(reader);
            await reader.end();
        } else if (kind === ElementType.e_image) {
            // Process Images
            await processImage(el);
        }
    }
};
269
/**
 * Opens newsletter.pdf from the test-files folder and logs the element
 * information of every page. Errors are caught and logged.
 */
const main = async () => {
    // Relative path to the folder containing test files.
    const inputPath = '../TestFiles/';
    try {
        console.log('-------------------------------------------------');
        console.log('Extract page element information from all ');
        console.log('pages in the document.');

        const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + 'newsletter.pdf');
        // Awaited so that a failure surfaces in the catch below and the document
        // is ready before any page is read.
        await doc.initSecurityHandler();

        const pageBegin = await doc.getPageIterator();

        const pageReader = await PDFNet.ElementReader.create();

        for (const itr = pageBegin; await itr.hasNext(); await itr.next()) // Read every page
        {
            const curPage = await itr.current();
            console.log('Page ' + await curPage.getIndex() + '----------------------------------------');
            await pageReader.beginOnPage(curPage);
            await processElements(pageReader);
            await pageReader.end();
        }

        console.log('Done.');
    } catch (err) {
        console.log(err);
    }
};
300 PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function (error) { console.log('Error: ' + JSON.stringify(error)); }).then(function () { return PDFNet.shutdown(); });
301 };
302 exports.runElementReaderAdvTest();
303})(exports);
304// eslint-disable-next-line spaced-comment
305//# sourceURL=ElementReaderAdvTest.js
1<?php
2#---------------------------------------------------------------------------------------
3# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
4# Consult LICENSE.txt regarding license information.
5#---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
/**
 * Prints the path segments of $path, reports whether it is stroked and/or
 * filled, then walks the graphics-state changes recorded by $reader.
 *
 * @param ElementReader $reader reader that produced $path; used for the change
 *                              list and for descending into tiling patterns
 * @param Element       $path   the path element to describe
 */
function ProcessPath($reader, $path)
{
    if ($path->IsClippingPath()) {
        echo nl2br("This is a clipping path\n");
    }

    $pathData = $path->GetPathData();
    $points = $pathData->GetPoints();
    $ops = $pathData->GetOperators();
    $opCount = count((array)$ops);

    // Use path.GetCTM() if you are interested in CTM (current transformation matrix).

    echo " Path Data Points := \"";
    // Each operator consumes a fixed number of coordinates from $points.
    $p = 0;
    for ($i = 0; $i < $opCount; ++$i) {
        switch ($ops[$i]) {
            case PathData::e_moveto:
                $x1 = $points[$p++];
                $y1 = $points[$p++];
                printf("M%.5g %.5g", $x1, $y1);
                break;
            case PathData::e_lineto:
                $x1 = $points[$p++];
                $y1 = $points[$p++];
                printf(" L%.5g %.5g", $x1, $y1);
                break;
            case PathData::e_cubicto:
                $x1 = $points[$p++];
                $y1 = $points[$p++];
                $x2 = $points[$p++];
                $y2 = $points[$p++];
                $x3 = $points[$p++];
                $y3 = $points[$p++];
                printf(" C%.5g %.5g %.5g %.5g %.5g %.5g", $x1, $y1, $x2, $y2, $x3, $y3);
                break;
            case PathData::e_rect:
                // A rectangle is stored as (x, y, width, height); expand it to its four corners.
                $x1 = $points[$p++];
                $y1 = $points[$p++];
                $w = $points[$p++];
                $h = $points[$p++];
                $x2 = $x1 + $w;
                $y2 = $y1;
                $x3 = $x2;
                $y3 = $y1 + $h;
                $x4 = $x1;
                $y4 = $y3;
                printf("M%.5g %.5g L%.5g %.5g L%.5g %.5g L%.5g %.5g Z",
                    $x1, $y1, $x2, $y2, $x3, $y3, $x4, $y4);
                break;
            case PathData::e_closepath:
                echo nl2br(" Close Path\n");
                break;
            default:
                // Unknown operators are ignored.
                break;
        }
    }

    echo "\" ";

    $gs = $path->GetGState();

    // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    if ($path->IsStroked()) {
        echo nl2br("Stroke path\n");

        if ($gs->GetStrokeColorSpace()->GetType() == ColorSpace::e_pattern) {
            echo nl2br("Path has associated pattern\n");
        } else {
            // Get stroke color (you can use PDFNet color conversion facilities)
            // $rgb = $gs->GetStrokeColorSpace()->Convert2RGB($gs->GetStrokeColor());
        }
    }

    if ($path->IsFilled()) {
        echo nl2br("Fill path\n");

        if ($gs->GetFillColorSpace()->GetType() == ColorSpace::e_pattern) {
            echo nl2br("Path has associated pattern\n");
        } else {
            // $rgb = $gs->GetFillColorSpace()->Convert2RGB($gs->GetFillColor());
        }
    }

    // Process any changes in graphics state ---------------------------------

    $changes = $reader->GetChangesIterator();
    while ($changes->HasNext()) {
        switch ($changes->Current()) {
            case GState::e_transform:
                // Get transform matrix for this element. Unlike path.GetCTM()
                // that returns the full transformation matrix, gs.GetTransform() returns
                // only the transformation matrix that was installed for this element.
                //
                // $gs->GetTransform();
                break;
            case GState::e_line_width:
                // $gs->GetLineWidth();
                break;
            case GState::e_line_cap:
                // $gs->GetLineCap();
                break;
            case GState::e_line_join:
                // $gs->GetLineJoin();
                break;
            case GState::e_flatness:
                break;
            case GState::e_miter_limit:
                // $gs->GetMiterLimit();
                break;
            case GState::e_dash_pattern:
                // $dashes = $gs->GetDashes($dashes);
                // $gs->GetPhase()
                break;
            case GState::e_fill_color:
                if ($gs->GetFillColorSpace()->GetType() == ColorSpace::e_pattern &&
                    $gs->GetFillPattern()->GetType() != PatternColor::e_shading) {
                    // Process the pattern data
                    $reader->PatternBegin(true);
                    ProcessElements($reader);
                    $reader->End();
                }
                break;
        }
        $changes->Next();
    }
    $reader->ClearChangeList();
}
173
/**
 * Consumes elements from $page_reader until the end of the current text
 * block, printing the font name and the printable characters of each text run.
 * Type 3 font glyphs are processed recursively as element streams.
 *
 * @param ElementReader $page_reader reader positioned just after a text-begin element
 */
function ProcessText($page_reader)
{
    // Begin text element
    echo nl2br("Begin Text Block:\n");

    while (($element = $page_reader->Next()) != NULL)
    {
        switch ($element->GetType())
        {
        case Element::e_text_end:
            // Finish the text block
            echo nl2br("End Text Block.\n");
            return;

        case Element::e_text:
            {
                $gs = $element->GetGState();

                $cs_fill = $gs->GetFillColorSpace();
                $fill = $gs->GetFillColor();

                $out = $cs_fill->Convert2RGB($fill);

                $cs_stroke = $gs->GetStrokeColorSpace();
                $stroke = $gs->GetStrokeColor();

                $font = $gs->GetFont();

                echo nl2br("Font Name: ".$font->GetName()."\n");
                // $font->IsFixedWidth();
                // $font->IsSerif();
                // $font->IsSymbolic();
                // $font->IsItalic();
                // ...

                // $font_size = $gs->GetFontSize();
                // $word_spacing = $gs->GetWordSpacing();
                // $char_spacing = $gs->GetCharSpacing();
                // $txt = $element->GetTextString();

                if ($font->GetType() == Font::e_Type3)
                {
                    // Type 3 font, process its data
                    for ($itr = $element->GetCharIterator(); $itr->HasNext(); $itr->Next())
                    {
                        $page_reader->Type3FontBegin($itr->Current());
                        ProcessElements($page_reader);
                        $page_reader->End();
                    }
                }
                else
                {
                    $text_mtx = $element->GetTextMatrix();

                    for ($itr = $element->GetCharIterator(); $itr->HasNext(); $itr->Next())
                    {
                        $char_code = $itr->Current()->char_code;
                        // Both bounds must hold; with `||` the test was always true
                        // and every code, printable or not, was echoed.
                        if ($char_code >= 32 && $char_code <= 255) { // Print if in ASCII range...
                            echo chr($char_code);
                        }

                        $x = $itr->Current()->x; // character positioning information
                        $y = $itr->Current()->y;
                        $pt = new Point($x, $y);

                        // Use element.GetCTM() if you are interested in the CTM
                        // (current transformation matrix).
                        $ctm = $element->GetCTM();

                        // To get the exact character positioning information you need to
                        // concatenate current text matrix with CTM and then multiply
                        // relative positioning coordinates with the resulting matrix.
                        //
                        // Build a fresh product each iteration. Assigning $text_mtx to
                        // another variable only copies the object handle, so calling
                        // Concat on it would keep modifying the text matrix from one
                        // character to the next.
                        // NOTE(review): assumes Matrix2D::Multiply returns a new matrix - confirm.
                        $mtx = $ctm->Multiply($text_mtx);
                        $mtx->Mult($pt);

                        // Get glyph path...
                        //$glyphPath = $font->GetGlyphPath($char_code, false, 0);
                        //$oprs = $glyphPath->GetOperators();
                        //$glyph_data = $glyphPath->GetDataPoints();
                    }
                }

                echo nl2br("\n");
            }
            break;
        }
    }
}
264
/**
 * Prints the pixel dimensions of $image and reads its data through the
 * Image2RGB filter.
 *
 * @param Element $image the image element to inspect
 */
function ProcessImage($image)
{
    // Other properties available on the element:
    //   $image->IsImageMask();
    //   $image->IsImageInterpolate();
    //   $image->GetCTM();               // image matrix (page positioning info)
    //   $image->GetBitsPerComponent();
    //   $image->GetImageData();         // raw (decoded) image data
    $width = $image->GetImageWidth();
    $height = $image->GetImageHeight();

    // Three bytes (R, G, B) per pixel.
    $out_data_sz = $width * $height * 3;

    // The closing quote after the height value keeps the attribute-style output balanced.
    echo "Image: "
        ." width=\"".$width."\""
        ." height=\"".$height."\"\n";

    // Image2RGB converts every image to RGB format. This should save you time
    // since you don't need to deal with color conversions, image up-sampling,
    // decoding etc.
    $img_conv = new Image2RGB($image); // Extract and convert image to RGB 8-bpc format
    $reader = new FilterReader($img_conv);

    // $image_data_out contains RGB image data.
    $image_data_out = $reader->Read($out_data_sz);

    // Note that you don't need to read a whole image at a time. Alternatively
    // you can read a chunk at a time by repeatedly calling $reader->Read($buf_sz)
    // until the function returns no more data.
}
298
/**
 * Drains $reader, dispatching each element to the handler for its type.
 * Form XObjects are entered and processed recursively.
 *
 * @param ElementReader $reader element reader to read page contents from
 */
function ProcessElements($reader)
{
    // Read page contents until the reader yields a value loosely equal to NULL.
    for ($el = $reader->Next(); $el != NULL; $el = $reader->Next()) {
        $kind = $el->GetType();

        if ($kind == Element::e_path) {
            // Process path data...
            ProcessPath($reader, $el);
        } elseif ($kind == Element::e_text_begin) {
            // Process text block...
            ProcessText($reader);
        } elseif ($kind == Element::e_form) {
            // Process form XObjects
            $reader->FormBegin();
            ProcessElements($reader);
            $reader->End();
        } elseif ($kind == Element::e_image) {
            // Process Images
            ProcessImage($el);
        }
    }
}
330
# Script entry point: opens newsletter.pdf and prints element information
# for every page.

# Relative path to the folder containing the test files.
$input_path = getcwd()."/../../TestFiles/";
$output_path = $input_path."Output/";

PDFNet::Initialize($LicenseKey);
// Wait for fonts to be loaded if they haven't already. This is done because PHP
// can run into errors when shutting down if font loading is still in progress.
PDFNet::GetSystemFontList();

# Banner
echo nl2br("__________________________________________________\n");
echo nl2br("Extract page element information from all \n");
echo nl2br("pages in the document.\n");

$doc = new PDFDoc($input_path."newsletter.pdf");
$doc->InitSecurityHandler();

$pgnum = $doc->GetPageCount();
$page_reader = new ElementReader();

# Read every page
$itr = $doc->GetPageIterator();
while ($itr->HasNext())
{
    echo nl2br("Page ".$itr->Current()->GetIndex()."----------------------------------------\n");
    $page_reader->Begin($itr->Current());
    ProcessElements($page_reader);
    $page_reader->End();
    $itr->Next();
}

$doc->Close();
PDFNet::Terminate();
echo nl2br("Done.\n");
361?>
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
# Prints the segments of a path element using M/L/C/Z letters, reports whether
# the path is stroked and/or filled, and then walks the graphics-state changes
# recorded by the reader.
#
# reader - ElementReader that returned +path+; its change list is iterated and
#          cleared at the end.
# path   - the path element to describe.
def ProcessPath(reader, path)
  puts "This is a clipping path" if path.IsClippingPath

  path_data = path.GetPathData
  points = path_data.GetPoints
  operators = path_data.GetOperators

  # Index of the next unread coordinate in +points+. Each call to take_coord
  # returns the value at the cursor and advances it by one.
  cursor = 0
  take_coord = lambda do
    value = points[cursor]
    cursor += 1
    value
  end

  # Use path.GetCTM if you are interested in CTM (current transformation matrix).
  print "Path Data Points := \""

  operators.size.times do |op_pos|
    case operators[op_pos].ord
    when PathData::E_moveto
      x1 = take_coord.call
      y1 = take_coord.call
      puts "M#{x1} #{y1}"
    when PathData::E_lineto
      x1 = take_coord.call
      y1 = take_coord.call
      print " L#{x1} #{y1}"
    when PathData::E_cubicto
      # Three coordinate pairs, read in the order they are stored.
      coords = Array.new(6) { take_coord.call }
      print " C" + coords.join(" ")
    when PathData::E_rect
      x1 = take_coord.call
      y1 = take_coord.call
      w = take_coord.call
      h = take_coord.call
      far_x = x1 + w
      far_y = y1 + h
      # Emit the rectangle as four corners followed by a close.
      print "M#{x1} #{y1} L #{far_x} #{y1} L #{far_x} #{far_y} L #{x1} #{far_y} Z"
    when PathData::E_closepath
      puts " Close Path"
    else
      raise "Assert: false"
    end
  end

  print "\" "
  gs = path.GetGState

  # Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
  if path.IsStroked
    puts "Stroke path"

    # For non-pattern strokes you can use PDFNet color conversion facilities:
    # rgb = gs.GetStrokeColorSpace.Convert2RGB(gs.GetStrokeColor)
    puts "Path has associated pattern" if gs.GetStrokeColorSpace.GetType == ColorSpace::E_pattern
  end

  if path.IsFilled
    puts "Fill path"

    # For non-pattern fills:
    # rgb = gs.GetFillColorSpace.Convert2RGB(gs.GetFillColor)
    puts "Path has associated pattern" if gs.GetFillColorSpace.GetType == ColorSpace::E_pattern
  end

  # Process any changes in graphics state  ---------------------------------
  changes = reader.GetChangesIterator
  while changes.HasNext
    case changes.Current
    when GState::E_transform,
         GState::E_line_width,
         GState::E_line_cap,
         GState::E_line_join,
         GState::E_flatness,
         GState::E_miter_limit,
         GState::E_dash_pattern
      # Nothing is done for these attributes in this sample. The matching
      # getters are gs.GetTransform (only the matrix installed for this
      # element, unlike path.GetCTM which returns the full matrix),
      # gs.GetLineWidth, gs.GetLineCap, gs.GetLineJoin, gs.GetMiterLimit,
      # gs.GetDashes and gs.GetPhase.
    when GState::E_fill_color
      if gs.GetFillColorSpace.GetType == ColorSpace::E_pattern and
         gs.GetFillPattern.GetType != PatternColor::E_shading
        # process the pattern data
        reader.PatternBegin(true)
        ProcessElements(reader)
        reader.End
      end
    end
    changes.Next
  end
  reader.ClearChangeList
end
147
# Prints the font name and the characters of every text run in the current
# text block.
#
# ProcessElements calls this after the reader returned an
# Element::E_text_begin element. Elements are consumed from +page_reader+
# until Element::E_text_end is read or the reader runs out of elements.
#
# page_reader - ElementReader positioned inside a text block.
def ProcessText(page_reader)
  # Begin text element
  puts "Begin Text Block:"

  element = page_reader.Next

  until element.nil?
    type = element.GetType
    if type == Element::E_text_end
      # Finish the text block
      puts "End Text Block."
      return
    elsif type == Element::E_text
      gs = element.GetGState

      # The color values below are fetched to illustrate the API; they are not
      # printed by this sample.
      cs_fill = gs.GetFillColorSpace
      fill = gs.GetFillColor

      out = cs_fill.Convert2RGB(fill)

      cs_stroke = gs.GetStrokeColorSpace
      stroke = gs.GetStrokeColor

      font = gs.GetFont
      puts "Font Name: " + font.GetName
      # font.IsFixedWidth
      # font.IsSerif
      # font.IsSymbolic
      # font.IsItalic
      # ...

      # font_size = gs.GetFontSize
      # word_spacing = gs.GetWordSpacing
      # char_spacing = gs.GetCharSpacing
      # txt = element.GetTextString
      if font.GetType == Font::E_Type3
        # type 3 font, process its data
        itr = element.GetCharIterator
        while itr.HasNext
          page_reader.Type3FontBegin(itr.Current)
          ProcessElements(page_reader)
          page_reader.End
          # Advance to the next character; without this step the loop would
          # process the first glyph forever.
          itr.Next
        end
      else
        text_mtx = element.GetTextMatrix

        itr = element.GetCharIterator
        while itr.HasNext
          char_code = itr.Current.char_code
          if char_code >= 32 and char_code <= 255   # Print only codes 32..255
            a = font.MapToUnicode(char_code)
            print a[0]
          end

          pt = Point.new
          pt.x = itr.Current.x    # character positioning information
          pt.y = itr.Current.y

          # Use element.GetCTM if you are interested in the CTM
          # (current transformation matrix).
          ctm = element.GetCTM

          # To get the exact character positioning information you need to
          # concatenate current text matrix with CTM and then multiply
          # relative positioning coordinates with the resulting matrix.
          mtx = ctm.Multiply(text_mtx)
          mtx.Mult(pt)
          itr.Next
        end
      end
      puts ""
    end
    element = page_reader.Next
  end
end
223
# Prints the pixel dimensions of an image element and reads its pixel data
# through an Image2RGB filter.
#
# image - the image element handed over by ProcessElements.
def ProcessImage(image)
  # Queried to illustrate the API; the results are not used further below.
  image_mask = image.IsImageMask
  interpolate = image.IsImageInterpolate
  width = image.GetImageWidth
  height = image.GetImageHeight
  # Byte count requested from the RGB filter below: width * height * 3.
  out_data_sz = width * height * 3

  # The height value is now closed with a quote, like the width value.
  puts "Image: width=\"" + width.to_s + "\"" + " height=\"" + height.to_s + "\""

  # mtx = image.GetCTM # image matrix (page positioning info)

  # You can use GetImageData to read the raw (decoded) image data
  #image.GetBitsPerComponent
  #image.GetImageData    # get raw image data
  # .... or use the Image2RGB filter that converts every image to RGB format.
  # This should save you time since you don't need to deal with color conversions,
  # image up-sampling, decoding etc.

  img_conv = Image2RGB.new(image)    # Extract and convert image to RGB 8-bpc format
  reader = FilterReader.new(img_conv)

  # image_data_out receives the RGB image data.
  image_data_out = reader.Read(out_data_sz)

  # Note that you don't need to read a whole image at a time. Alternatively
  # you can read a chunk at a time by repeatedly calling reader.Read(buf_sz)
  # until the function returns 0.
end
251
# Pulls elements from +reader+ until it returns nil and hands each path, text
# block, form XObject and image to its handler. Form XObjects are entered with
# FormBegin and processed recursively.
#
# reader - ElementReader positioned on a page, form, pattern or glyph stream.
def ProcessElements(reader)
  # Read page contents
  until (element = reader.Next).nil?
    case element.GetType
    when Element::E_path then ProcessPath(reader, element)    # Process path data...
    when Element::E_text_begin then ProcessText(reader)       # Process text block...
    when Element::E_form
      # Process form XObjects: descend, handle the children, then leave.
      reader.FormBegin
      ProcessElements(reader)
      reader.End
    when Element::E_image then ProcessImage(element)          # Process Images
    end
  end
end
271
# Script entry point: opens newsletter.pdf and prints element information
# for every page.
PDFNet.Initialize(PDFTronLicense.Key)

# Relative path to the folder containing the test files.
input_path = "../../TestFiles/"
output_path = "../../TestFiles/Output/"

# Banner; puts prints each array entry on its own line.
puts [
  "__________________________________________________",
  "Extract page element information from all ",
  "pages in the document."
]

doc = PDFDoc.new("#{input_path}newsletter.pdf")
doc.InitSecurityHandler
pgnum = doc.GetPageCount
page_reader = ElementReader.new

# Read every page
itr = doc.GetPageIterator
while itr.HasNext
  puts "Page #{itr.Current.GetIndex}----------------------------------------"
  page_reader.Begin(itr.Current)
  ProcessElements(page_reader)
  page_reader.End
  itr.Next
end

doc.Close
PDFNet.Terminate
puts "Done."
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
def ProcessPath(reader, path):
    """Print a path element's segments and paint state, then scan graphics-state changes.

    reader -- the ElementReader that returned ``path``; its change list is
              iterated and cleared at the end.
    path   -- the path element to describe.
    """
    if path.IsClippingPath():
        print("This is a clipping path")

    path_data = path.GetPathData()
    points = path_data.GetPoints()
    operators = path_data.GetOperators()

    # Use path.GetCTM() if you are interested in CTM (current transformation matrix).

    sys.stdout.write("Path Data Points := \"")

    cursor = 0  # index of the next unread coordinate in points
    for op_pos in range(len(operators)):
        op = operators[op_pos]
        if op == PathData.e_moveto:
            x1 = points[cursor]
            y1 = points[cursor + 1]
            cursor += 2
            sys.stdout.write("M%s %s" % (x1, y1))
        elif op == PathData.e_lineto:
            x1 = points[cursor]
            y1 = points[cursor + 1]
            cursor += 2
            sys.stdout.write(" L%s %s" % (x1, y1))
        elif op == PathData.e_cubicto:
            # Three coordinate pairs, read in the order they are stored.
            coords = [points[cursor + k] for k in range(6)]
            cursor += 6
            sys.stdout.write(" C" + " ".join([str(c) for c in coords]))
        elif op == PathData.e_rect:
            x1 = points[cursor]
            y1 = points[cursor + 1]
            w = points[cursor + 2]
            h = points[cursor + 3]
            cursor += 4
            far_x = x1 + w
            far_y = y1 + h
            # Emit the rectangle as four corners followed by a close.
            sys.stdout.write("M%s %s L%s %s L%s %s L%s %s Z"
                             % (x1, y1, far_x, y1, far_x, far_y, x1, far_y))
        elif op == PathData.e_closepath:
            print(" Close Path")
        else:
            assert False

    sys.stdout.write("\" ")
    gs = path.GetGState()

    # Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    if path.IsStroked():
        print("Stroke path")

        # For non-pattern strokes you can use PDFNet color conversion facilities:
        # rgb = gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor())
        if gs.GetStrokeColorSpace().GetType() == ColorSpace.e_pattern:
            print("Path has associated pattern")

    if path.IsFilled():
        print("Fill path")

        # For non-pattern fills:
        # rgb = gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor())
        if gs.GetFillColorSpace().GetType() == ColorSpace.e_pattern:
            print("Path has associated pattern")

    # Process any changes in graphics state  ---------------------------------
    changes = reader.GetChangesIterator()
    while changes.HasNext():
        attr = changes.Current()
        if attr in (GState.e_transform,
                    GState.e_line_width,
                    GState.e_line_cap,
                    GState.e_line_join,
                    GState.e_flatness,
                    GState.e_miter_limit,
                    GState.e_dash_pattern):
            # Nothing is done for these attributes in this sample. The matching
            # getters are gs.GetTransform() (only the matrix installed for this
            # element, unlike path.GetCTM() which returns the full matrix),
            # gs.GetLineWidth(), gs.GetLineCap(), gs.GetLineJoin(),
            # gs.GetMiterLimit(), gs.GetDashes() and gs.GetPhase().
            pass
        elif attr == GState.e_fill_color:
            if (gs.GetFillColorSpace().GetType() == ColorSpace.e_pattern and
                    gs.GetFillPattern().GetType() != PatternColor.e_shading):
                # process the pattern data
                reader.PatternBegin(True)
                ProcessElements(reader)
                reader.End()
        changes.Next()
    reader.ClearChangeList()
148
def ProcessText(page_reader):
    """Print the font name and characters of every text run in the current text block.

    ProcessElements calls this after the reader returned an
    Element.e_text_begin element. Elements are consumed from ``page_reader``
    until Element.e_text_end is read or the reader runs out of elements.

    page_reader -- ElementReader positioned inside a text block.
    """
    # Begin text element
    print("Begin Text Block:")

    element = page_reader.Next()

    while element is not None:
        # Named elem_type so the builtin type() is not shadowed.
        elem_type = element.GetType()
        if elem_type == Element.e_text_end:
            # Finish the text block
            print("End Text Block.")
            return
        elif elem_type == Element.e_text:
            gs = element.GetGState()

            # The color values below are fetched to illustrate the API; they
            # are not printed by this sample.
            cs_fill = gs.GetFillColorSpace()
            fill = gs.GetFillColor()

            out = cs_fill.Convert2RGB(fill)

            cs_stroke = gs.GetStrokeColorSpace()
            stroke = gs.GetStrokeColor()

            font = gs.GetFont()
            print("Font Name: " + font.GetName())
            # font.IsFixedWidth()
            # font.IsSerif()
            # font.IsSymbolic()
            # font.IsItalic()
            # ...

            # font_size = gs.GetFontSize()
            # word_spacing = gs.GetWordSpacing()
            # char_spacing = gs.GetCharSpacing()
            # txt = element.GetTextString()
            if font.GetType() == Font.e_Type3:
                # type 3 font, process its data
                itr = element.GetCharIterator()
                while itr.HasNext():
                    page_reader.Type3FontBegin(itr.Current())
                    ProcessElements(page_reader)
                    page_reader.End()
                    # Advance to the next character; without this step the
                    # loop would process the first glyph forever.
                    itr.Next()
            else:
                text_mtx = element.GetTextMatrix()

                itr = element.GetCharIterator()
                while itr.HasNext():
                    char_code = itr.Current().char_code
                    if char_code >= 32 and char_code <= 255:  # Print only codes 32..255
                        a = font.MapToUnicode(char_code)
                        sys.stdout.write(a[0] if sys.version_info.major < 3 else ascii(a[0]))

                    pt = Point()
                    pt.x = itr.Current().x  # character positioning information
                    pt.y = itr.Current().y

                    # Use element.GetCTM() if you are interested in the CTM
                    # (current transformation matrix).
                    ctm = element.GetCTM()

                    # To get the exact character positioning information you need to
                    # concatenate current text matrix with CTM and then multiply
                    # relative positioning coordinates with the resulting matrix.
                    mtx = ctm.Multiply(text_mtx)
                    mtx.Mult(pt)
                    itr.Next()
            print("")
        element = page_reader.Next()
217
def ProcessImage(image):
    """Print the pixel dimensions of an image element and read its data as RGB.

    image -- the image element handed over by ProcessElements.
    """
    # Queried to illustrate the API; the results are not used further below.
    image_mask = image.IsImageMask()
    interpolate = image.IsImageInterpolate()
    width = image.GetImageWidth()
    height = image.GetImageHeight()
    # Byte count requested from the RGB filter below: width * height * 3.
    out_data_sz = width * height * 3

    # The height value is now closed with a quote, like the width value.
    print("Image: width=\"" + str(width) + "\"" + " height=\"" + str(height) + "\"")

    # mtx = image.GetCTM()  # image matrix (page positioning info)

    # You can use GetImageData to read the raw (decoded) image data
    # image.GetBitsPerComponent()
    # image.GetImageData()  # get raw image data
    # .... or use the Image2RGB filter that converts every image to RGB format.
    # This should save you time since you don't need to deal with color conversions,
    # image up-sampling, decoding etc.

    img_conv = Image2RGB(image)  # Extract and convert image to RGB 8-bpc format
    reader = FilterReader(img_conv)

    # image_data_out receives the RGB image data.
    image_data_out = reader.Read(out_data_sz)

    # Note that you don't need to read a whole image at a time. Alternatively
    # you can read a chunk at a time by repeatedly calling reader.Read(buf_sz)
    # until the function returns 0.
244
def ProcessElements(reader):
    """Dispatch every element the reader yields to the matching handler.

    Elements are pulled with reader.Next() until it returns None. Form
    XObjects are entered with FormBegin() and processed recursively.

    reader -- ElementReader positioned on a page, form, pattern or glyph stream.
    """
    # Read page contents; iteration stops at the first None returned by Next().
    for element in iter(reader.Next, None):
        kind = element.GetType()
        if kind == Element.e_path:
            # Process path data...
            ProcessPath(reader, element)
        elif kind == Element.e_text_begin:
            # Process text block...
            ProcessText(reader)
        elif kind == Element.e_form:
            # Process form XObjects: descend, handle the children, then leave.
            reader.FormBegin()
            ProcessElements(reader)
            reader.End()
        elif kind == Element.e_image:
            # Process Images
            ProcessImage(element)
260
if __name__ == '__main__':
    # Script entry point: opens newsletter.pdf and prints element information
    # for every page.
    PDFNet.Initialize(LicenseKey)

    # Relative path to the folder containing the test files.
    input_path = "../../TestFiles/"
    output_path = "../../TestFiles/Output/"

    # Banner
    for banner_line in ("__________________________________________________",
                        "Extract page element information from all ",
                        "pages in the document."):
        print(banner_line)

    doc = PDFDoc(input_path + "newsletter.pdf")
    doc.InitSecurityHandler()
    pgnum = doc.GetPageCount()
    page_reader = ElementReader()

    # Read every page
    itr = doc.GetPageIterator()
    while itr.HasNext():
        print("Page %s----------------------------------------" % str(itr.Current().GetIndex()))
        page_reader.Begin(itr.Current())
        ProcessElements(page_reader)
        page_reader.End()
        itr.Next()

    doc.Close()
    PDFNet.Terminate()
    print("Done.")
1'
2' Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3'
4' A sample project illustrating some extraction capabilities of ElementReader
5' in more detail
6'
7
8Imports System
9
10Imports pdftron
11Imports pdftron.Common
12Imports pdftron.Filters
13Imports pdftron.SDF
14Imports pdftron.PDF
15
' Sample module that walks every page of a PDF with an ElementReader and
' prints information about the path, text and image elements it encounters.
Module ElementReaderAdvTestVB
    Dim pdfNetLoader As PDFNetLoader
    Sub New()
        pdfNetLoader = pdftron.PDFNetLoader.Instance()
    End Sub

    ' Scratch buffer holding the most recently formatted path segment.
    Dim m_buf As String

    ' Prints the segments of a path element using M/L/C/Z letters, reports
    ' whether the path is stroked and/or filled, and walks the graphics-state
    ' changes recorded by the reader.
    '
    ' reader: the ElementReader that returned the path element.
    ' path:   the path element to describe.
    Sub ProcessPath(ByRef reader As ElementReader, ByRef path As Element)
        If path.IsClippingPath() Then
            Console.WriteLine("This is a clipping path")
        End If

        Dim pathData As PathData = path.GetPathData()
        Dim data As Double() = pathData.points
        Dim opr As Byte() = pathData.operators

        Dim opr_itr As Integer = 0
        Dim opr_end As Integer = opr.Length
        Dim data_itr As Integer = 0     ' index of the next unread coordinate in data
        Dim x1, y1, x2, y2, x3, y3 As Double

        ' Use path.GetCTM() if you are interested in CTM (current transformation matrix).

        ' VB has no backslash escapes: a literal quote is written as two quotes.
        ' This opens the quoted list that is closed after the loop.
        Console.Write(" Path Data Points := """)
        While opr_itr < opr_end
            If opr(opr_itr) = pathData.PathSegmentType.e_moveto Then
                x1 = data(data_itr)
                data_itr += 1
                y1 = data(data_itr)
                data_itr += 1
                m_buf = String.Format("M{0:g5} {1:g5}", x1, y1)
                Console.Write(m_buf)
            ElseIf opr(opr_itr) = pathData.PathSegmentType.e_lineto Then
                x1 = data(data_itr)
                data_itr += 1
                y1 = data(data_itr)
                data_itr += 1
                m_buf = String.Format(" L{0:g5} {1:g5}", x1, y1)
                Console.Write(m_buf)
            ElseIf opr(opr_itr) = pathData.PathSegmentType.e_cubicto Then
                x1 = data(data_itr)
                data_itr += 1
                y1 = data(data_itr)
                data_itr += 1
                x2 = data(data_itr)
                data_itr += 1
                y2 = data(data_itr)
                data_itr += 1
                x3 = data(data_itr)
                data_itr += 1
                y3 = data(data_itr)
                data_itr += 1
                Dim coords() As Object = New Object() {x1, y1, x2, y2, x3, y3}
                m_buf = String.Format(" C{0:g5} {1:g5} {2:g5} {3:g5} {4:g5} {5:g5}", _
                    coords)
                Console.Write(m_buf)
            ElseIf opr(opr_itr) = pathData.PathSegmentType.e_rect Then
                x1 = data(data_itr)
                data_itr += 1
                y1 = data(data_itr)
                data_itr += 1
                Dim w As Double = data(data_itr)
                data_itr += 1
                Dim h As Double = data(data_itr)
                data_itr += 1
                ' Emit the rectangle as four corners followed by a close.
                x2 = x1 + w
                y2 = y1
                x3 = x2
                y3 = y1 + h
                Dim x4 As Double = x1
                Dim y4 As Double = y3
                Dim coords() As Object = New Object() {x1, y1, x2, y2, x3, y3, x4, y4}
                m_buf = String.Format("M{0:g5} {1:g5} L{2:g5} {3:g5} L{4:g5} {5:g5} L{6:g5} {7:g5} Z", _
                    coords)
                Console.Write(m_buf)
            ElseIf opr(opr_itr) = pathData.PathSegmentType.e_closepath Then
                Console.WriteLine(" Close Path")
            Else
                System.Diagnostics.Debug.Assert(False)
            End If

            opr_itr += 1
        End While

        Console.Write(""" ")

        Dim gs As GState = path.GetGState()

        ' Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
        If path.IsStroked() Then
            Console.WriteLine("Stroke path")
            If gs.GetStrokeColorSpace().GetType() = ColorSpace.Type.e_pattern Then
                Console.WriteLine("Path has associated pattern")
            Else
                ' Get stroke color (you can use PDFNet color conversion facilities)
                ' Dim rgb As ColorPt
                ' gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb)
            End If
        Else
            ' Do not stroke path
        End If

        If path.IsFilled() Then
            Console.WriteLine("Fill path")

            If gs.GetFillColorSpace().GetType() = ColorSpace.Type.e_pattern Then
                Console.WriteLine("Path has associated pattern")
            Else
                ' Dim rgb As ColorPt
                ' gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb)
            End If
        Else
            ' Do not fill path
        End If

        ' Process any changes in graphics state  ---------------------------------
        ' The branches are placeholders naming the getter for each attribute.
        Dim gs_itr As GSChangesIterator = reader.GetChangesIterator()
        While gs_itr.HasNext()
            If gs_itr.Current() = GState.GStateAttribute.e_transform Then
                ' Get transform matrix for this element. Unlike path.GetCTM()
                ' that return full transformation matrix gs.GetTransform() return
                ' only the transformation matrix that was installed for this element.
                '
                ' gs.GetTransform()
            ElseIf gs_itr.Current() = GState.GStateAttribute.e_line_width Then
                ' gs.GetLineWidth()
            ElseIf gs_itr.Current() = GState.GStateAttribute.e_line_cap Then
                ' gs.GetLineCap()
            ElseIf gs_itr.Current() = GState.GStateAttribute.e_line_join Then
                ' gs.GetLineJoin()
            ElseIf gs_itr.Current() = GState.GStateAttribute.e_flatness Then
            ElseIf gs_itr.Current() = GState.GStateAttribute.e_miter_limit Then
                ' gs.GetMiterLimit()
            ElseIf gs_itr.Current() = GState.GStateAttribute.e_dash_pattern Then
                ' Dim dashes As Double()
                ' gs.GetDashes(dashes)
                ' gs.GetPhase()
            End If

            gs_itr.Next()
        End While
    End Sub

    ' Prints font name, scaled font size, fill color and the position of every
    ' character for each text run in the current text block.
    '
    ' ProcessElements calls this after the reader returned an e_text_begin
    ' element. Elements are consumed from page_reader until e_text_end is read
    ' or the reader runs out of elements.
    Sub ProcessText(ByRef page_reader As ElementReader)
        ' Begin text element
        Console.WriteLine("Begin Text Block:")

        Dim element As Element
        element = page_reader.Next()
        While Not IsNothing(element)
            If element.GetType() = element.Type.e_text_end Then
                ' Finish the text block
                Console.WriteLine("End Text Block.")
                Return
            ElseIf element.GetType() = element.Type.e_text Then
                Dim gs As GState = element.GetGState()

                ' Fetched to illustrate the API; not printed by this sample.
                Dim cs_fill As ColorSpace = gs.GetFillColorSpace()
                Dim fill As ColorPt = gs.GetFillColor()

                Dim outc As ColorPt = New ColorPt
                cs_fill.Convert2RGB(fill, outc)

                Dim cs_stroke As ColorSpace = gs.GetStrokeColorSpace()
                Dim stroke As ColorPt = gs.GetStrokeColor()

                Dim font As Font = gs.GetFont()

                Console.Write("Font Name: ")
                Console.Write(font.GetName())
                ' font.IsFixedWidth()
                ' font.IsSerif()
                ' font.IsSymbolic()
                ' font.IsItalic()
                ' ...

                ' Dim word_spacing As Double = gs.GetWordSpacing()
                ' Dim char_spacing As Double = gs.GetCharSpacing()

                ' Use element.GetCTM() if you are interested in the CTM
                ' (current transformation matrix).
                Dim ctm As Matrix2D = element.GetCTM()

                Dim text_mtx As Matrix2D = element.GetTextMatrix()

                ' mtx = CTM concatenated with the text matrix; the length of its
                ' (m_b, m_d) column is used to scale the nominal font size.
                Dim mtx As Matrix2D = New Matrix2D
                mtx.Set(ctm)
                mtx.Concat(text_mtx.m_a, text_mtx.m_b, text_mtx.m_c, text_mtx.m_d, text_mtx.m_h, text_mtx.m_v)
                Dim font_sz_scale_factor As Double = System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d)
                Dim font_size As Double = gs.GetFontSize()
                Console.Write(" Font Size: {0:f}", font_sz_scale_factor * font_size)

                Dim font_color As ColorPt = gs.GetFillColor()
                Dim cs As ColorSpace = gs.GetFillColorSpace()

                Dim rgb As ColorPt = New ColorPt
                cs.Convert2RGB(font_color, rgb)

                Console.WriteLine(" Font Color(RGB): red={0:d} green={1:d} blue={2:d}", _
                    CByte(rgb.Get(0) * 255), CByte(rgb.Get(1) * 255), CByte(rgb.Get(2) * 255))

                Dim x, y As Double
                Dim char_code As Integer

                Dim itr As CharIterator = element.GetCharIterator()
                While itr.HasNext()
                    Console.Write("Character code: ")
                    char_code = itr.Current().char_code
                    ' NOTE(review): Chr may reject codes above 255 depending on the
                    ' code page - confirm against fonts with multi-byte codes.
                    Console.Write(Chr(char_code))

                    x = itr.Current().x     ' character positioning information
                    y = itr.Current().y

                    ' To get the exact character positioning information you need to
                    ' concatenate current text matrix with CTM and then multiply
                    ' relative positioning coordinates with the resulting matrix.
                    '
                    mtx.Set(ctm)
                    mtx.Concat(text_mtx.m_a, text_mtx.m_b, text_mtx.m_c, text_mtx.m_d, text_mtx.m_h, text_mtx.m_v)
                    mtx.Mult(x, y)
                    Console.WriteLine(" Position: x={0:f} y={1:f}", x, y)
                    itr.Next()
                End While

                Console.WriteLine()
            End If
            element = page_reader.Next()
        End While
    End Sub

    ' Prints the pixel dimensions of an image element and reads its pixel data
    ' through an Image2RGB filter.
    Sub ProcessImage(ByRef image As Element)
        ' Queried to illustrate the API; the results are not used further below.
        Dim image_mask As Boolean = image.IsImageMask()
        Dim interpolate As Boolean = image.IsImageInterpolate()
        Dim width As Integer = image.GetImageWidth()
        Dim height As Integer = image.GetImageHeight()
        ' Size of the buffer handed to the RGB filter below: width * height * 3.
        Dim out_data_sz As Integer = width * height * 3

        Console.WriteLine("Image: width=""{0:d}"" height=""{1:d}""", width, height)

        ' Dim mtx As Matrix2D = image.GetCTM() ' image matrix (page positioning info)

        ' You can use GetImageData to read the raw (decoded) image data
        'image.GetBitsPerComponent()
        'image.GetImageData()    ' get raw image data
        ' .... or use the Image2RGB filter that converts every image to RGB format.
        ' This should save you time since you don't need to deal with color conversions,
        ' image up-sampling, decoding etc.

        Dim img_conv As Image2RGB = New Image2RGB(image)     ' Extract and convert image to RGB 8-bpc format
        Dim reader As FilterReader = New FilterReader(img_conv)

        ' A buffer used to keep image data. VB array bounds are inclusive, so an
        ' upper bound of out_data_sz - 1 yields out_data_sz bytes. Previously the
        ' buffer was Nothing when passed to Read.
        Dim image_data_out As Byte() = New Byte(out_data_sz - 1) {}

        reader.Read(image_data_out)
        ' image_data_out contains RGB image data.

        ' Note that you don't need to read a whole image at a time. Alternatively
        ' you can read a chunk at a time by repeatedly calling reader.Read(buf)
        ' with a smaller buffer until the function returns 0.
    End Sub

    ' Pulls elements from the reader until it returns Nothing and hands each
    ' path, text block, form XObject and image to its handler. Form XObjects
    ' are entered with FormBegin() and processed recursively.
    Sub ProcessElements(ByRef reader As ElementReader)
        ' Fetch the first element exactly once; a second Next() call here would
        ' discard the first element of every page or form.
        Dim element As Element = reader.Next()

        While Not IsNothing(element)     ' Read page contents
            If element.GetType() = element.Type.e_path Then
                ' Process path data...
                ProcessPath(reader, element)
            ElseIf element.GetType() = element.Type.e_text_begin Then
                ' Process text strings...
                ProcessText(reader)
            ElseIf element.GetType() = element.Type.e_form Then
                ' Process form XObjects
                reader.FormBegin()
                ProcessElements(reader)
                reader.End()
            ElseIf element.GetType() = element.Type.e_image Then
                ' Process Images
                ProcessImage(element)
            End If
            element = reader.Next()
        End While
    End Sub

    ' Entry point: opens newsletter.pdf, prints the crop box of every page and
    ' processes the page's elements.
    Sub Main()

        PDFNet.Initialize(PDFTronLicense.Key)

        ' Relative path to the folder containing test files.
        Dim input_path As String = "../../../../TestFiles/"
        ' Dim output_path As String = "../../../../TestFiles/Output/"

        Console.WriteLine("-------------------------------------------------")
        Console.WriteLine("Extract page element information from all")
        Console.WriteLine("pages in the document.")

        ' Open the test file
        Console.WriteLine("Opening the input file...")
        Using doc As PDFDoc = New PDFDoc(input_path + "newsletter.pdf")
            doc.InitSecurityHandler()

            Dim pgnum As Integer = doc.GetPageCount()

            Dim itr As PageIterator
            Using page_reader As ElementReader = New ElementReader
                itr = doc.GetPageIterator()
                While itr.HasNext()      ' Read every page
                    Console.WriteLine("Page {0:d} ----------------------------------------", _
                        itr.GetPageNumber())

                    Dim crop_box As Rect = itr.Current().GetCropBox()
                    Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2)
                    Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height())

                    page_reader.Begin(itr.Current())
                    ProcessElements(page_reader)
                    page_reader.End()
                    itr.Next()
                End While
            End Using
        End Using
        PDFNet.Terminate()
        Console.WriteLine("Done.")

    End Sub

End Module
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales