ElementReaderAdv

Sample Obj-C code that uses the Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to perform color conversion, normalize images, and process changes in the graphics state. Learn more about our iOS SDK and PDF Data Extraction SDK Capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#import <OBJC/PDFNetOBJC.h>
7#import <Foundation/Foundation.h>
8
// File-scope character buffer (4000 bytes), available as scratch space for
// C-style string formatting by the functions in this file.
9char m_buf[4000];
10
// Forward declaration: ProcessPath and ProcessText call ProcessElements,
// which is defined further down in this file.
11void ProcessElements(PTElementReader *reader);
12
/**
 * Logs a textual description of a path element and walks the graphics-state
 * changes recorded by the reader.
 *
 * The path geometry is written as an SVG-like string: "M x y" for move-to,
 * " L x y" for line-to, " C x1 y1 x2 y2 x3 y3" for cubic curves, rectangles
 * expanded to their four corners followed by "Z", and " Close Path" for
 * close-path operators. Notes on stroking, filling and pattern color spaces
 * are appended, and the result is emitted with a single NSLog.
 *
 * If the fill color changed to a non-shading pattern, the pattern's content
 * is processed recursively through ProcessElements.
 *
 * @param reader The element reader the path came from. Its change iterator is
 *               consumed and its change list is cleared before returning.
 * @param path   The path element to describe.
 */
void ProcessPath(PTElementReader *reader, PTElement *path)
{
    if ([path IsClippingPath])
    {
        NSLog(@"This is a clipping path");
    }

    PTPathData *pathData = [path GetPathData];
    NSMutableArray *points = [pathData GetPoints];
    NSData *operators = [pathData GetOperators];

    const unsigned char *ops = (const unsigned char *)operators.bytes;
    const NSUInteger op_count = operators.length;
    const NSUInteger point_count = points.count;
    NSUInteger point_index = 0;

    // Use path.GetCTM() if you are interested in CTM (current transformation matrix).

    // A mutable string avoids re-copying the whole description on every append.
    NSMutableString *str = [NSMutableString stringWithString: @" Path Data Points := \""];

    for (NSUInteger op_index = 0; op_index < op_count; ++op_index)
    {
        const unsigned char op = ops[op_index];

        // Number of coordinate values this operator consumes from `points`.
        NSUInteger needed = 0;
        switch (op)
        {
            case e_ptmoveto:
            case e_ptlineto:
                needed = 2;
                break;
            case e_ptcubicto:
                needed = 6;
                break;
            case e_ptrect:
                needed = 4;
                break;
            case e_ptclosepath:
                needed = 0;
                break;
            default:
                // Unknown operator: trap in debug builds, skip it otherwise.
                assert(false);
                continue;
        }

        // point_index never exceeds point_count, so the subtraction cannot wrap.
        if (needed > point_count - point_index)
        {
            NSLog(@"Path data is truncated: operator %lu needs %lu value(s) but only %lu remain",
                  (unsigned long)op_index,
                  (unsigned long)needed,
                  (unsigned long)(point_count - point_index));
            break;
        }

        double v[6] = {0};
        for (NSUInteger k = 0; k < needed; ++k)
        {
            v[k] = [points[point_index] doubleValue];
            ++point_index;
        }

        // Format straight into the string; no fixed-size C buffer is involved.
        switch (op)
        {
            case e_ptmoveto:
                [str appendFormat: @"M%.5g %.5g", v[0], v[1]];
                break;
            case e_ptlineto:
                [str appendFormat: @" L%.5g %.5g", v[0], v[1]];
                break;
            case e_ptcubicto:
                [str appendFormat: @" C%.5g %.5g %.5g %.5g %.5g %.5g",
                                   v[0], v[1], v[2], v[3], v[4], v[5]];
                break;
            case e_ptrect:
            {
                // Operands are x, y, width, height; emit the four corners.
                const double x = v[0];
                const double y = v[1];
                const double right = x + v[2];
                const double top = y + v[3];
                [str appendFormat: @"M%.5g %.5g L%.5g %.5g L%.5g %.5g L%.5g %.5g Z",
                                   x, y, right, y, right, top, x, top];
            }
                break;
            case e_ptclosepath:
                [str appendString: @" Close Path"];
                break;
            default:
                break;
        }
    }

    [str appendString: @"\" "];

    PTGState *gs = [path GetGState];

    // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    if ([path IsStroked])
    {
        [str appendString: @"Stroke path\n"];

        if ([[gs GetStrokeColorSpace] GetType] == e_ptpattern)
        {
            [str appendString: @"Path has associated pattern"];
        }
        else
        {
            // Get stroke color (you can use PDFNet color conversion facilities)
            // ColorPt rgb;
            // gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
        }
    }

    if ([path IsFilled])
    {
        [str appendString: @"Fill path"];

        if ([[gs GetFillColorSpace] GetType] == e_ptpattern)
        {
            [str appendString: @"Path has associated pattern"];
        }
        else
        {
            // PTColorPt *rgb = [[[PTColorPt alloc] init] autorelease];
            // [[gs GetFillColorSpace] Convert2RGB: [gs GetFillColorWithColorPt: rgb]];
        }
    }

    // Process any changes in graphics state ---------------------------------

    PTGSChangesIterator *gs_itr = [reader GetChangesIterator];
    for (; [gs_itr HasNext]; [gs_itr Next])
    {
        switch ([gs_itr Current])
        {
            case e_pttransform:
                // Get transform matrix for this element. Unlike path.GetCTM(),
                // which returns the full transformation matrix, gs.GetTransform()
                // returns only the matrix that was installed for this element.
                //
                // gs.GetTransform();
                break;
            case e_ptline_width:
                // gs.GetLineWidth();
                break;
            case e_ptline_cap:
                // gs.GetLineCap();
                break;
            case e_ptline_join:
                // gs.GetLineJoin();
                break;
            case e_ptflatness:
                break;
            case e_ptmiter_limit:
                // gs.GetMiterLimit();
                break;
            case e_ptdash_pattern:
                // std::vector<double> dashes;
                // gs.GetDashes(dashes);
                // gs.GetPhase()
                break;
            case e_ptfill_color:
                if ([[gs GetFillColorSpace] GetType] == e_ptpattern &&
                    [[gs GetFillPattern] GetType] != e_ptshading)
                {
                    // Process the pattern data
                    [reader PatternBegin: YES reset_ctm_tfm: NO];
                    ProcessElements(reader);
                    [reader End];
                }
                break;
            default:
                break;
        }
    }
    [reader ClearChangeList];
    NSLog(@"%@", str);
}
186
/**
 * Logs the contents of one text block.
 *
 * Reads elements from the reader until the text-end element is reached (or
 * the reader runs out of elements). For every text object it logs the font
 * name; for Type 3 fonts each glyph's content is processed recursively via
 * ProcessElements, otherwise the character codes in the range 32..255 are
 * collected into a string and logged.
 *
 * @param page_reader Reader positioned just after a text-begin element.
 */
void ProcessText(PTElementReader* page_reader)
{
    // Begin text element
    NSLog(@"Begin Text Block:");

    PTElement *element;
    while ((element = [page_reader Next]) != nil)
    {
        switch ([element GetType])
        {
            case e_pttext_end:
                // Finish the text block
                NSLog(@"End Text Block.");
                return;

            case e_pttext_obj:
            {
                PTGState *gs = [element GetGState];

                // The color queries below illustrate the API; this sample does
                // not use their results any further.
                PTColorSpace *cs_fill = [gs GetFillColorSpace];
                PTColorPt *fill = [gs GetFillColor];

                PTColorPt *outColor = [cs_fill Convert2RGB: fill];

                PTColorSpace *cs_stroke = [gs GetStrokeColorSpace];
                PTColorPt *stroke = [gs GetStrokeColor];
                (void)outColor;
                (void)cs_stroke;
                (void)stroke;

                PTFont *font = [gs GetFont];

                NSLog(@"Font Name: %@\n", [font GetName]);

                // font.IsFixedWidth();
                // font.IsSerif();
                // font.IsSymbolic();
                // font.IsItalic();
                // ...

                // double font_size = gs.GetFontSize();
                // double word_spacing = gs.GetWordSpacing();
                // double char_spacing = gs.GetCharSpacing();
                // const UString* txt = element.GetTextString();

                if ([font GetType] == e_ptType3)
                {
                    // Type 3 font: each glyph is a content stream, process its data
                    PTCharIterator *itr;
                    for (itr = [element GetCharIterator]; [itr HasNext]; [itr Next])
                    {
                        [page_reader Type3FontBegin: [itr Current] resource_dict: nil];
                        ProcessElements(page_reader);
                        [page_reader End];
                    }
                }
                else
                {
                    // To get the exact character positioning information you need to
                    // concatenate the current text matrix with the CTM and then multiply
                    // relative positioning coordinates with the resulting matrix.
                    //
                    // Both matrices are fixed for the element, so they are combined once
                    // here. The Concat result is never read back, so it presumably updates
                    // the receiver in place; doing this per character would therefore
                    // compound the CTM onto the same matrix again on every iteration.
                    PTMatrix2D *mtx = [element GetTextMatrix];
                    PTMatrix2D *ctm = [element GetCTM];
                    [mtx Concat: [ctm getM_a] b: [ctm getM_b] c: [ctm getM_c] d: [ctm getM_d] h: [ctm getM_h] v: [ctm getM_v]];

                    NSMutableString *str = [NSMutableString string];
                    PTCharIterator *itr;
                    for (itr = [element GetCharIterator]; [itr HasNext]; [itr Next])
                    {
                        unsigned int char_code = [[itr Current] getChar_code];
                        // Both bounds must hold; with "||" the test was always true.
                        if (char_code >= 32 && char_code <= 255)
                        {
                            [str appendFormat: @"%c", char_code];
                        }

                        // Character positioning information (relative to the text matrix)
                        double x = [[itr Current] getX];
                        double y = [[itr Current] getY];

                        // Map the relative position through the combined matrix.
                        // The sample does not use the transformed point further.
                        [mtx Mult: [[PTPDFPoint alloc] initWithPx: x py: y]];

                        // Get glyph path...
                        //vector<UChar> oprs;
                        //vector<double> glyph_data;
                        //font.GetGlyphPath(char_code, oprs, glyph_data, false, 0);
                    }
                    NSLog(@"%@", str);
                }
            }
                break;

            default:
                break;
        }
    }
}
287
/**
 * Logs an image element's dimensions and reads its pixels as 8-bit-per-channel RGB.
 *
 * The image is wrapped in a PTImage2RGB filter and read through a
 * PTFilterReader in a single call. The decoded bytes are not used further by
 * this sample. Images with a non-positive width or height are logged and
 * then skipped.
 *
 * @param image The image element to process.
 */
void ProcessImage(PTElement *image)
{
    // Queried to illustrate the API; the sample does not use the values.
    bool image_mask = [image IsImageMask];
    bool interpolate = [image IsImageInterpolate];
    (void)image_mask;
    (void)interpolate;

    int width = [image GetImageWidth];
    int height = [image GetImageHeight];

    NSLog(@"Image: width=\"%d\" height=\"%d\"", width, height);

    if (width <= 0 || height <= 0)
    {
        // Nothing to decode; also keeps the size computation below well-defined.
        return;
    }

    // 3 bytes (R, G, B) per pixel. Computed in size_t because the product
    // width * height * 3 can exceed the range of int for large images.
    const size_t out_data_sz = (size_t)width * (size_t)height * 3;

    // Matrix2D& mtx = image->GetCTM(); // image matrix (page positioning info)

    // You can use GetImageData to read the raw (decoded) image data
    //image->GetBitsPerComponent();
    //image->GetImageData(); // get raw image data
    // .... or use Image2RGB filter that converts every image to RGB format.
    // This should save you time since you don't need to deal with color conversions,
    // image up-sampling, decoding etc.

    PTImage2RGB *img_conv = [[PTImage2RGB alloc] initWithImage_element: image]; // Extract and convert image to RGB 8-bpc format
    PTFilterReader *reader = [[PTFilterReader alloc] initWithFilter: img_conv];

    // image_data_out holds the RGB image data.
    NSData *image_data_out = [reader Read: out_data_sz];
    (void)image_data_out;

    // Note that you don't need to read a whole image at a time. Alternatively
    // you can read a chunk at a time by repeatedly calling Read with a smaller
    // size until no more data is returned.
}
318
/**
 * Drains the reader, dispatching each element by type.
 *
 * Paths go to ProcessPath, text blocks to ProcessText, and images to
 * ProcessImage. Form XObjects are entered with FormBegin, processed
 * recursively, and left with End. All other element types are ignored.
 *
 * @param reader The element reader to pull elements from until Next returns nil.
 */
void ProcessElements(PTElementReader *reader)
{
    for (PTElement *current = [reader Next]; current != NULL; current = [reader Next])
    {
        const int kind = (int)[current GetType];

        if (kind == e_ptpath)
        {
            // Path data
            ProcessPath(reader, current);
        }
        else if (kind == e_pttext_begin)
        {
            // Text block
            ProcessText(reader);
        }
        else if (kind == e_ptform)
        {
            // Form XObject: descend into its content
            [reader FormBegin];
            ProcessElements(reader);
            [reader End];
        }
        else if (kind == e_ptimage)
        {
            // Image
            ProcessImage(current);
        }
        // Anything else is skipped.
    }
}
355
/**
 * Entry point: opens ../../TestFiles/newsletter.pdf and logs element
 * information for every page.
 *
 * @return 0 on success, 1 if an NSException was raised while processing.
 */
int main(int argc, char *argv[])
{
    @autoreleasepool {
        int exit_code = 0;
        [PTPDFNet Initialize: 0];

        // Extract page element data from all pages in the document
        @try
        {
            NSLog(@"__________________________________________________");
            NSLog(@"Extract page element information from all ");
            NSLog(@"pages in the document.");

            PTPDFDoc *document = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/newsletter.pdf"];
            [document InitSecurityHandler];

            // The page count is queried but not needed for the iteration below.
            int page_count = [document GetPageCount];
            (void)page_count;

            PTPageIterator *pages = [document GetPageIterator: 1];
            PTElementReader *element_reader = [[PTElementReader alloc] init];

            // Visit every page
            while ([pages HasNext])
            {
                NSLog(@"Page %d----------------------------------------", [[pages Current] GetIndex]);
                [element_reader Begin: [pages Current]];
                ProcessElements(element_reader);
                [element_reader End];

                [pages Next];
            }

            NSLog(@"Done.");
        }
        @catch (NSException *exception)
        {
            NSLog(@"%@", exception.reason);
            exit_code = 1;
        }

        [PTPDFNet Terminate: 0];
        return exit_code;
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales