Sample C# code for using Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Learn more about our UWP SDK and PDF Data Extraction SDK Capabilities.
1//
2// Copyright (c) 2001-2020 by PDFTron Systems Inc. All Rights Reserved.
3//
4
5using System;
6using System.IO;
7using System.Threading.Tasks;
8using Windows.Foundation;
9
10using pdftron.Common;
11using pdftron.PDF;
12using pdftron.SDF;
13
14using PDFNetUniversalSamples.ViewModels;
15
16namespace PDFNetSamples
17{
18 public sealed class ElementReaderAdvTest : Sample
19 {
/// <summary>
/// Passes the sample's name and its description text to the <c>Sample</c> base constructor.
/// </summary>
public ElementReaderAdvTest()
    : base(
        "ElementReaderAdv",
        "The sample shows how to use some of more advanced PDFNet features. " +
        "The sample code illustrates how to extract text, paths, and images. " +
        "The sample also shows how to do color conversion, image normalization, " +
        "and how to process changes in the graphics state.")
{
    // Nothing to initialise here; all arguments are forwarded to the base class.
}
24
/// <summary>
/// Runs the sample on a background task: opens "newsletter.pdf" from <c>InputPath</c>,
/// prints the crop box of the page the iterator starts on, and walks that page's
/// content elements through <see cref="ProcessElements"/>.
/// </summary>
/// <returns>The background task exposed as an <c>IAsyncAction</c>.</returns>
public override IAsyncAction RunAsync()
{
    return Task.Run(new System.Action(() => {
        WriteLine("--------------------------------");
        WriteLine("Starting ElementReaderAdv Test...");
        WriteLine("--------------------------------\n");
        try
        {
            // NOTE(review): the message says "all pages", but only the first page is read below.
            WriteLine("Extract page element information from all pages in the document.");
            string input_file_path = Path.Combine(InputPath, "newsletter.pdf");
            WriteLine("Opening input file " + input_file_path);
            PDFDoc doc = new PDFDoc(input_file_path);
            try
            {
                doc.InitSecurityHandler();

                ElementReader page_reader = new ElementReader();

                PageIterator itr = doc.GetPageIterator(); // Read first page
                WriteLine(String.Format("Page {0:d} ----------------------------------------", itr.GetPageNumber()));

                pdftron.PDF.Rect crop_box = itr.Current().GetCropBox();
                crop_box.Normalize();

                WriteLine(String.Format(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2));
                WriteLine(String.Format(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height()));

                page_reader.Begin(itr.Current());
                ProcessElements(page_reader);
                page_reader.End();
            }
            finally
            {
                // Release the document even when processing throws. This stays inside the
                // outer try so a failure in Destroy() is still reported by the catch below.
                doc.Destroy();
            }
            WriteLine("\nDone.");
        }
        catch (Exception e)
        {
            WriteLine(GetExceptionMessage(e));
        }

        WriteLine("\n--------------------------------");
        WriteLine("Done ElementReaderAdv Test.");
        WriteLine("--------------------------------\n");
    })).AsAsyncAction();
}
71
72 String m_buf;
73
/// <summary>
/// Builds a textual description of a path element: whether it is a clipping path,
/// its segments (move, line, cubic, rectangle, close), and whether it is stroked
/// and/or filled. Afterwards the reader's graphics-state change list is walked
/// (processing the content of a non-shading fill pattern when one is reported) and cleared.
/// </summary>
/// <param name="reader">Reader that produced <paramref name="path"/>; supplies the change list and is used to read pattern content.</param>
/// <param name="path">The path element to describe.</param>
/// <returns>The description text for this path.</returns>
String ProcessPath(ElementReader reader, Element path)
{
    // A builder avoids re-copying the whole text on every appended segment.
    System.Text.StringBuilder result = new System.Text.StringBuilder();
    if (path.IsClippingPath())
    {
        result.Append("This is a clipping path.\n");
    }

    PathData pathData = path.GetPathData();
    double[] data = pathData.get_pts(); // coordinates consumed by the operators below
    byte[] opr = pathData.get_ops();    // one operator code per path segment

    int data_itr = 0; // index of the next unread coordinate in data
    double x1, y1, x2, y2, x3, y3;

    // Use path.GetCTM() if you are interested in CTM (current transformation matrix).

    result.Append(" Path Data Points := \"\n");
    for (int opr_itr = 0; opr_itr < opr.Length; ++opr_itr)
    {
        switch ((PathDataPathSegmentType)((int)opr[opr_itr]))
        {
            case PathDataPathSegmentType.e_moveto:
                x1 = data[data_itr++];
                y1 = data[data_itr++];
                result.AppendFormat("M{0:g5} {1:g5}\n", x1, y1);
                break;
            case PathDataPathSegmentType.e_lineto:
                x1 = data[data_itr++];
                y1 = data[data_itr++];
                result.AppendFormat(" L{0:g5} {1:g5}\n", x1, y1);
                break;
            case PathDataPathSegmentType.e_cubicto:
                x1 = data[data_itr++];
                y1 = data[data_itr++];
                x2 = data[data_itr++];
                y2 = data[data_itr++];
                x3 = data[data_itr++];
                y3 = data[data_itr++];
                result.AppendFormat(" C{0:g5} {1:g5} {2:g5} {3:g5} {4:g5} {5:g5}\n",
                    new object[] { x1, y1, x2, y2, x3, y3 });
                break;
            case PathDataPathSegmentType.e_rect:
                {
                    // A rectangle is stored as origin + width + height; expand it to its four corners.
                    x1 = data[data_itr++];
                    y1 = data[data_itr++];
                    double w = data[data_itr++];
                    double h = data[data_itr++];
                    x2 = x1 + w;
                    y2 = y1;
                    x3 = x2;
                    y3 = y1 + h;
                    double x4 = x1;
                    double y4 = y3;
                    // Fourth corner is (x4, y4); the trailing newline keeps the next segment on its own line.
                    result.AppendFormat("M{0:g5} {1:g5} L{2:g5} {3:g5} L{4:g5} {5:g5} L{6:g5} {7:g5} Z\n",
                        new object[] { x1, y1, x2, y2, x3, y3, x4, y4 });
                    break;
                }
            case PathDataPathSegmentType.e_closepath:
                result.Append("\n Close Path\n");
                break;
            default:
                System.Diagnostics.Debug.Assert(false);
                break;
        }
    }

    result.Append("\" ");

    GState gs = path.GetGState();

    // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    if (path.IsStroked())
    {
        result.Append("Stroke path\n");

        if (gs.GetStrokeColorSpace().GetType() == ColorSpaceType.e_pattern)
        {
            result.Append("Path has associated pattern\n");
        }
        else
        {
            // Get stroke color (you can use PDFNet color conversion facilities)
            // ColorPt rgb = new ColorPt();
            // gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
        }
    }
    else
    {
        // Do not stroke path
    }

    if (path.IsFilled())
    {
        result.Append("Fill path\n");

        if (gs.GetFillColorSpace().GetType() == ColorSpaceType.e_pattern)
        {
            result.Append("Path has associated pattern\n");
        }
        else
        {
            // ColorPt rgb = new ColorPt();
            // gs.GetFillColorSpace().Convert2RGB(gs.GetFillColor(), rgb);
        }
    }
    else
    {
        // Do not fill path
    }

    // Process any changes in graphics state ---------------------------------

    GSChangesIterator gs_itr = reader.GetChangesIterator();
    for (; gs_itr.HasNext(); gs_itr.Next())
    {
        switch (gs_itr.Current())
        {
            case GStateGStateAttribute.e_transform:
                // Get transform matrix for this element. Unlike path.GetCTM(),
                // which returns the full transformation matrix, gs.GetTransform() returns
                // only the transformation matrix that was installed for this element.
                //
                // gs.GetTransform();
                break;
            case GStateGStateAttribute.e_line_width:
                // gs.GetLineWidth();
                break;
            case GStateGStateAttribute.e_line_cap:
                // gs.GetLineCap();
                break;
            case GStateGStateAttribute.e_line_join:
                // gs.GetLineJoin();
                break;
            case GStateGStateAttribute.e_flatness:
                break;
            case GStateGStateAttribute.e_miter_limit:
                // gs.GetMiterLimit();
                break;
            case GStateGStateAttribute.e_dash_pattern:
                {
                    // double[] dashes;
                    // gs.GetDashes(dashes);
                    // gs.GetPhase()
                    break;
                }
            case GStateGStateAttribute.e_fill_color:
                {
                    if (gs.GetFillColorSpace().GetType() == ColorSpaceType.e_pattern &&
                        gs.GetFillPattern().GetType() != PatternColorType.e_shading)
                    {
                        // Process the pattern data. ProcessElements writes its own output,
                        // so nothing is appended to this path's text here.
                        reader.PatternBegin(true);
                        ProcessElements(reader);
                        reader.End();
                    }
                    break;
                }
        }
    }
    reader.ClearChangeList();
    return result.ToString();
}
247
/// <summary>
/// Reads elements from <paramref name="page_reader"/> until the end of the current text
/// block and describes each text run: font name and, for non-Type3 fonts, the effective
/// font size, the fill colour converted to RGB, and every character with its position.
/// The glyphs of a Type3 font are processed through <see cref="ProcessElements"/> instead.
/// </summary>
/// <param name="page_reader">Reader positioned just after a text-begin element.</param>
/// <returns>
/// The description text. Returns as soon as the text-end element is seen; if the reader
/// runs out of elements first, whatever was collected so far is returned.
/// </returns>
String ProcessText(ElementReader page_reader)
{
    // A builder avoids re-copying the whole text for every appended character line.
    System.Text.StringBuilder result = new System.Text.StringBuilder();

    // Begin text element
    result.Append("Begin Text Block:\n");

    Element element;
    while ((element = page_reader.Next()) != null)
    {
        switch (element.GetType())
        {
            case ElementType.e_text_end:
                // Finish the text block
                result.Append("End Text Block.\n");
                return result.ToString();

            case ElementType.e_text:
                {
                    GState gs = element.GetGState();
                    Font font = gs.GetFont();

                    result.Append("Font Name: ");
                    result.Append(font.GetName() + "\n");
                    // font.IsFixedWidth();
                    // font.IsSerif();
                    // font.IsSymbolic();
                    // font.IsItalic();
                    // ...

                    // double word_spacing = gs.GetWordSpacing();
                    // double char_spacing = gs.GetCharSpacing();
                    // gs.GetStrokeColorSpace() / gs.GetStrokeColor() give the stroke colour if needed.

                    // Use element.GetCTM() if you are interested in the CTM
                    // (current transformation matrix).
                    if (font.GetType() == FontType.e_Type3)
                    {
                        // Type 3 font: process the content of each glyph.
                        for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                        {
                            page_reader.Type3FontBegin(itr.Current());
                            ProcessElements(page_reader);
                            page_reader.End();
                        }
                    }
                    else
                    {
                        Matrix2D ctm = element.GetCTM();
                        Matrix2D text_mtx = element.GetTextMatrix();

                        // To get exact character positions, concatenate the text matrix with
                        // the CTM and multiply the relative coordinates by the result. The
                        // matrix is the same for every character of the run, so build it once.
                        // NOTE(review): assumes mtx.Mult(x, y) does not modify mtx — confirm.
                        Matrix2D mtx = Matrix2D.Mult(ctm, text_mtx);

                        double font_sz_scale_factor = System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d);
                        double font_size = gs.GetFontSize();
                        result.AppendFormat(" Font Size: {0:f}\n", font_sz_scale_factor * font_size);

                        ColorPt font_color = gs.GetFillColor();
                        ColorSpace cs = gs.GetFillColorSpace();

                        ColorPt rgb = new ColorPt();
                        cs.Convert2RGB(font_color, rgb);

                        result.AppendFormat("Font Color(RGB): red={0:d} green={1:d} blue={2:d}\n",
                            (byte)(rgb.Get(0) * 255),
                            (byte)(rgb.Get(1) * 255),
                            (byte)(rgb.Get(2) * 255));

                        for (CharIterator itr = element.GetCharIterator(); itr.HasNext(); itr.Next())
                        {
                            result.Append("Character code: ");
                            int char_code = itr.Current().char_code;
                            result.AppendFormat("{0}\n", (char)char_code);

                            // Character positioning information, relative to the text run.
                            pdftron.Common.DoubleRef x = new pdftron.Common.DoubleRef(itr.Current().x);
                            pdftron.Common.DoubleRef y = new pdftron.Common.DoubleRef(itr.Current().y);

                            mtx.Mult(x, y);

                            result.AppendFormat(" Position: x={0:f} y={1:f}\n", x.Value, y.Value);
                        }
                    }
                    break;
                }
        }
    }
    return result.ToString();
}
356
357 int image_counter = 0;
358
/// <summary>
/// Describes an image element by its width and height and increments the
/// <c>image_counter</c> field.
/// </summary>
/// <param name="image">The image element to describe.</param>
/// <returns>A line of the form <c>Image: width="W" height="H"</c> surrounded by newlines.</returns>
String ProcessImage(Element image)
{
    // Queried to show the available image properties; the values are not used further.
    bool isMask = image.IsImageMask();
    bool isInterpolated = image.IsImageInterpolate();

    int pixelWidth = image.GetImageWidth();
    int pixelHeight = image.GetImageHeight();

    // Three bytes per pixel: the buffer size used by the commented Image2RGB example below.
    int rgbByteCount = pixelWidth * pixelHeight * 3;

    String description = String.Format("\nImage: width=\"{0:d}\" height=\"{1:d}\"\n", pixelWidth, pixelHeight);

    // Matrix2D mtx = image.GetCTM(); // image matrix (page positioning info)

    image_counter++;

    /*
    System.Drawing.Bitmap bmp = image.GetBitmap();
    bmp.Save(output_path + "reader_img_extract_" + image_counter.ToString() + ".png", System.Drawing.Imaging.ImageFormat.Png);
    */

    // Alternatively, GetImageData returns the raw (decoded) image data:
    //   image.GetBitsPerComponent();
    //   image.GetImageData(); // get raw image data
    // Another approach is the Image2RGB filter, which converts every image to RGB.
    // That can save time because there is no need to deal with color conversions,
    // image up-sampling, decoding etc.
    // ----------------
    //   Image2RGB img_conv = new Image2RGB(image);          // Extract and convert image to RGB 8-bpc format
    //   FilterReader reader = new FilterReader(img_conv);
    //   byte[] image_data_out = new byte[rgbByteCount];     // A buffer used to keep image data.
    //   reader.Read(image_data_out);                        // image_data_out contains RGB image data.
    // ----------------
    // The whole image does not have to be read at once: a chunk can be read at a time
    // by repeatedly calling reader.Read(buf, buf_sz) until the function returns 0.
    return description;
}
395
/// <summary>
/// Reads every remaining element from <paramref name="reader"/>, hands paths, text
/// blocks, form XObjects and images to their handlers, and writes the collected text
/// with <c>Write</c>.
/// </summary>
/// <param name="reader">Reader to drain; for a form XObject it is descended into and then ended.</param>
/// <returns>Always an empty string; the collected text is written out, not returned.</returns>
String ProcessElements(ElementReader reader)
{
    System.Text.StringBuilder report = new System.Text.StringBuilder();

    // Read page contents
    for (Element current = reader.Next(); current != null; current = reader.Next())
    {
        switch (current.GetType())
        {
            case ElementType.e_path:
                // Path data
                report.Append(ProcessPath(reader, current)).Append("\n");
                break;

            case ElementType.e_text_begin:
                // Text strings: the handler reads on until the end of the text block
                report.Append(ProcessText(reader)).Append("\n");
                break;

            case ElementType.e_form:
                // Form XObject: the nested call writes its own output and returns ""
                reader.FormBegin();
                report.Append(ProcessElements(reader)).Append("\n");
                reader.End();
                break;

            case ElementType.e_image:
                // Images
                report.Append(ProcessImage(current)).Append("\n");
                break;
        }
    }

    // Print result msg
    Write(report.ToString());

    return "";
}
433 }
434}
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales