ElementReaderAdv

Sample code in Swift and Obj-C for using Apryse iOS SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with the iOS SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#import <OBJC/PDFNetOBJC.h>
7#import <Foundation/Foundation.h>
8
// File-scope 4000-byte character buffer, available as scratch space for
// C-style string formatting.
9char m_buf[4000];
10
// Forward declaration: ProcessElements is defined further down in the file,
// but the element-processing functions defined before it call it to recurse
// into nested content.
11void ProcessElements(PTElementReader *reader);
12
/// Returns how many coordinate values a path operator consumes from the
/// path's point array, or -1 when the operator is not handled by this sample.
static NSInteger PathOperandCount(unsigned char op)
{
    switch (op)
    {
        case e_ptmoveto:
        case e_ptlineto:
            return 2;   // x y
        case e_ptcubicto:
            return 6;   // x1 y1 x2 y2 x3 y3
        case e_ptrect:
            return 4;   // x y width height
        case e_ptclosepath:
            return 0;
        default:
            return -1;
    }
}

/// Logs a textual description of a path element: its segments, whether it is
/// stroked and/or filled, and then walks the graphics-state changes reported
/// by the reader (recursing into non-shading fill patterns).
///
/// @param reader The element reader that produced `path`; used to iterate the
///               graphics-state change list and to descend into patterns.
/// @param path   The path element to describe.
void ProcessPath(PTElementReader *reader, PTElement *path)
{
    if ([path IsClippingPath])
    {
        NSLog(@"This is a clipping path");
    }

    PTPathData *pathData = [path GetPathData];
    NSMutableArray *points = [pathData GetPoints];
    NSData *operators = [pathData GetOperators];

    const unsigned char *ops = (const unsigned char *)operators.bytes;
    const NSUInteger opCount = operators.length;
    const NSUInteger pointCount = points.count;
    NSUInteger next = 0;    // index of the next unread coordinate in `points`

    // Use path.GetCTM() if you are interested in CTM (current transformation matrix).

    // A mutable string avoids re-copying the whole description for every segment.
    NSMutableString *str = [NSMutableString stringWithString: @" Path Data Points := \""];

    for (NSUInteger i = 0; i < opCount; ++i)
    {
        const NSInteger needed = PathOperandCount(ops[i]);
        if (needed < 0)
        {
            // Unknown operator: flagged in debug builds, skipped otherwise.
            assert(false);
            continue;
        }

        // `next` never exceeds `pointCount`, so the subtraction cannot wrap.
        if ((NSUInteger)needed > pointCount - next)
        {
            // Operator stream and point array disagree; stop instead of
            // indexing past the end of the array.
            NSLog(@"Malformed path data: operator %u needs %ld value(s) but only %lu remain",
                  (unsigned)ops[i], (long)needed, (unsigned long)(pointCount - next));
            break;
        }

        double v[6] = { 0 };
        for (NSInteger k = 0; k < needed; ++k)
        {
            v[k] = [points[next] doubleValue];
            ++next;
        }

        switch (ops[i])
        {
            case e_ptmoveto:
                [str appendFormat: @"M%.5g %.5g", v[0], v[1]];
                break;
            case e_ptlineto:
                [str appendFormat: @" L%.5g %.5g", v[0], v[1]];
                break;
            case e_ptcubicto:
                [str appendFormat: @" C%.5g %.5g %.5g %.5g %.5g %.5g",
                                   v[0], v[1], v[2], v[3], v[4], v[5]];
                break;
            case e_ptrect:
            {
                // Expand the rectangle (x, y, w, h) into its four corners.
                const double x1 = v[0];
                const double y1 = v[1];
                const double x2 = x1 + v[2];
                const double y2 = y1;
                const double x3 = x2;
                const double y3 = y1 + v[3];
                const double x4 = x1;
                const double y4 = y3;
                [str appendFormat: @"M%.5g %.5g L%.5g %.5g L%.5g %.5g L%.5g %.5g Z",
                                   x1, y1, x2, y2, x3, y3, x4, y4];
            }
                break;
            case e_ptclosepath:
                [str appendString: @" Close Path"];
                break;
            default:
                break;
        }
    }

    [str appendString: @"\" "];

    PTGState *gs = [path GetGState];

    // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    if ([path IsStroked])
    {
        [str appendString: @"Stroke path\n"];

        if ([[gs GetStrokeColorSpace] GetType] == e_ptpattern)
        {
            [str appendString: @"Path has associated pattern"];
        }
        else
        {
            // Get stroke color (you can use PDFNet color conversion facilities)
            // ColorPt rgb;
            // gs.GetStrokeColorSpace().Convert2RGB(gs.GetStrokeColor(), rgb);
        }
    }
    else
    {
        // Do not stroke path
    }

    if ([path IsFilled])
    {
        [str appendString: @"Fill path"];

        if ([[gs GetFillColorSpace] GetType] == e_ptpattern)
        {
            [str appendString: @"Path has associated pattern"];
        }
        else
        {
            // PTColorPt *rgb = [[[PTColorPt alloc] init] autorelease];
            // [[gs GetFillColorSpace] Convert2RGB: [gs GetFillColorWithColorPt: rgb]];
        }
    }
    else
    {
        // Do not fill path
    }

    // Process any changes in graphics state ---------------------------------

    PTGSChangesIterator *gs_itr = [reader GetChangesIterator];
    for (; [gs_itr HasNext]; [gs_itr Next])
    {
        switch ([gs_itr Current])
        {
            case e_pttransform:
                // Get transform matrix for this element. Unlike path.GetCTM()
                // that returns the full transformation matrix, gs.GetTransform()
                // returns only the transformation matrix that was installed for
                // this element.
                //
                // gs.GetTransform();
                break;
            case e_ptline_width:
                // gs.GetLineWidth();
                break;
            case e_ptline_cap:
                // gs.GetLineCap();
                break;
            case e_ptline_join:
                // gs.GetLineJoin();
                break;
            case e_ptflatness:
                break;
            case e_ptmiter_limit:
                // gs.GetMiterLimit();
                break;
            case e_ptdash_pattern:
            {
                // std::vector<double> dashes;
                // gs.GetDashes(dashes);
                // gs.GetPhase()
            }
                break;
            case e_ptfill_color:
            {
                if ([[gs GetFillColorSpace] GetType] == e_ptpattern &&
                    [[gs GetFillPattern] GetType] != e_ptshading)
                {
                    // Process the pattern's own content stream.
                    [reader PatternBegin: YES reset_ctm_tfm: NO];
                    ProcessElements(reader);
                    [reader End];
                }
            }
                break;
            default:
                break;
        }
    }
    [reader ClearChangeList];
    NSLog(@"%@", str);
}
186
/// Consumes elements from `page_reader` until the end of the current text
/// block, logging the font name and the characters of each text run.
/// Type 3 font glyphs are processed recursively through ProcessElements.
///
/// @param page_reader Reader positioned just after a text-begin element.
void ProcessText(PTElementReader* page_reader)
{
    // Begin text element
    NSLog(@"Begin Text Block:");

    PTElement *element;
    while ((element = [page_reader Next]) != nil)
    {
        switch ([element GetType])
        {
            case e_pttext_end:
                // Finish the text block
                NSLog(@"End Text Block.");
                return;

            case e_pttext_obj:
            {
                PTGState *gs = [element GetGState];

                // Fill and stroke color queries, shown for illustration; the
                // results are not used further in this sample.
                PTColorSpace *cs_fill = [gs GetFillColorSpace];
                PTColorPt *fill = [gs GetFillColor];

                PTColorPt *outColor = [cs_fill Convert2RGB: fill];

                PTColorSpace *cs_stroke = [gs GetStrokeColorSpace];
                PTColorPt *stroke = [gs GetStrokeColor];

                PTFont *font = [gs GetFont];

                NSLog(@"Font Name: %@\n", [font GetName]);

                // font.IsFixedWidth();
                // font.IsSerif();
                // font.IsSymbolic();
                // font.IsItalic();
                // ...

                // double font_size = gs.GetFontSize();
                // double word_spacing = gs.GetWordSpacing();
                // double char_spacing = gs.GetCharSpacing();
                // const UString* txt = element.GetTextString();

                if ([font GetType] == e_ptType3)
                {
                    // Type 3 font: every glyph is its own content stream.
                    PTCharIterator *itr;
                    for (itr = [element GetCharIterator]; [itr HasNext]; [itr Next])
                    {
                        [page_reader Type3FontBegin: [itr Current] resource_dict: nil];
                        ProcessElements(page_reader);
                        [page_reader End];
                    }
                }
                else
                {
                    // To get the exact character positioning information you need to
                    // concatenate the current text matrix with the CTM (current
                    // transformation matrix) and then multiply the relative
                    // positioning coordinates with the resulting matrix.
                    //
                    // The combined matrix is built once per text run. The previous
                    // code aliased the text matrix and called Concat on it for every
                    // character, so (assuming Concat modifies the receiver in place,
                    // as its use here implies) the CTM was folded in repeatedly.
                    PTMatrix2D *ctm = [element GetCTM];
                    PTMatrix2D *mtx = [element GetTextMatrix];
                    [mtx Concat: [ctm getM_a] b: [ctm getM_b] c: [ctm getM_c] d: [ctm getM_d] h: [ctm getM_h] v: [ctm getM_v]];

                    NSMutableString *str = [NSMutableString string];
                    PTCharIterator *itr;
                    for (itr = [element GetCharIterator]; [itr HasNext]; [itr Next])
                    {
                        unsigned int char_code = [[itr Current] getChar_code];
                        // Print only codes in the single-byte printable range.
                        // The test must be a conjunction: with `||` it was true
                        // for every possible value.
                        if (char_code >= 32 && char_code <= 255)
                        {
                            [str appendFormat: @"%c", char_code];
                        }

                        double x = [[itr Current] getX];    // character positioning information
                        double y = [[itr Current] getY];

                        // Position of this character after applying the combined matrix.
                        PTPDFPoint *position = [mtx Mult: [[PTPDFPoint alloc] initWithPx: x py: y]];
                        (void)position;

                        // Get glyph path...
                        //vector<UChar> oprs;
                        //vector<double> glyph_data;
                        //font.GetGlyphPath(char_code, oprs, glyph_data, false, 0);
                    }
                    NSLog(@"%@", str);
                }
            }
                break;

            default:
                break;
        }
    }
}
287
/// Logs the dimensions of an image element and reads its pixels converted to
/// 8-bit-per-component RGB through a PTImage2RGB filter.
///
/// @param image The image element to process.
void ProcessImage(PTElement *image)
{
    // Image properties queried for illustration; not used further here.
    bool image_mask = [image IsImageMask];
    bool interpolate = [image IsImageInterpolate];
    (void)image_mask;
    (void)interpolate;

    int width = [image GetImageWidth];
    int height = [image GetImageHeight];

    NSLog(@"Image: width=\"%d\" height=\"%d\"", width, height);

    // Nothing to read for a degenerate image, and a non-positive dimension
    // must not be turned into a buffer size.
    if (width <= 0 || height <= 0)
    {
        return;
    }

    // Three bytes (R, G, B) per pixel. Computed in size_t because the product
    // of two ints times three can exceed INT_MAX for large images.
    const size_t w = (size_t)width;
    const size_t h = (size_t)height;
    if (w > SIZE_MAX / 3 / h)
    {
        NSLog(@"Image is too large to buffer in one read");
        return;
    }
    const size_t out_data_sz = w * h * 3;

    // Matrix2D& mtx = image->GetCTM(); // image matrix (page positioning info)

    // You can use GetImageData to read the raw (decoded) image data
    //image->GetBitsPerComponent();
    //image->GetImageData(); // get raw image data
    // .... or use Image2RGB filter that converts every image to RGB format.
    // This should save you time since you don't need to deal with color conversions,
    // image up-sampling, decoding etc.

    PTImage2RGB *img_conv = [[PTImage2RGB alloc] initWithImage_element: image]; // Extract and convert image to RGB 8-bpc format
    PTFilterReader *reader = [[PTFilterReader alloc] initWithFilter: img_conv];

    // image_data_out holds the RGB image data.
    NSData *image_data_out = [reader Read: out_data_sz];
    (void)image_data_out;

    // Note that you don't need to read a whole image at a time. Alternatively
    // you can read a chunk at a time by repeatedly calling Read: with a smaller
    // size until it returns no more data.
}
318
/// Drains `reader`, dispatching each element to the matching handler:
/// paths, text blocks, form XObjects (processed recursively) and images.
/// Elements of any other type are ignored.
///
/// @param reader Reader already positioned on the content to walk.
void ProcessElements(PTElementReader *reader)
{
    for (PTElement *item = [reader Next]; item != NULL; item = [reader Next])
    {
        const int kind = (int)[item GetType];

        if (kind == e_ptpath)
        {
            // Path data
            ProcessPath(reader, item);
        }
        else if (kind == e_pttext_begin)
        {
            // Start of a text block; ProcessText consumes it up to its end marker
            ProcessText(reader);
        }
        else if (kind == e_ptform)
        {
            // Form XObject: descend into its content, then step back out
            [reader FormBegin];
            ProcessElements(reader);
            [reader End];
        }
        else if (kind == e_ptimage)
        {
            // Image
            ProcessImage(item);
        }
    }
}
355
/// Entry point: opens the sample newsletter PDF and logs the page elements of
/// every page. Returns 0 on success and 1 if an exception was caught.
int main(int argc, char *argv[])
{
    @autoreleasepool {
        int exit_code = 0;
        [PTPDFNet Initialize: 0];

        // Extract element data from all pages in the document
        @try
        {
            NSLog(@"__________________________________________________");
            NSLog(@"Extract page element information from all ");
            NSLog(@"pages in the document.");

            PTPDFDoc *pdf = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/newsletter.pdf"];
            [pdf InitSecurityHandler];

            int page_total = [pdf GetPageCount];
            PTPageIterator *pages = [pdf GetPageIterator: 1];

            PTElementReader *content_reader = [[PTElementReader alloc] init];

            // Visit every page, starting from the first
            while ([pages HasNext])
            {
                NSLog(@"Page %d----------------------------------------", [[pages Current] GetIndex]);
                [content_reader Begin: [pages Current]];
                ProcessElements(content_reader);
                [content_reader End];
                [pages Next];
            }

            NSLog(@"Done.");
        }
        @catch (NSException *ex)
        {
            NSLog(@"%@", ex.reason);
            exit_code = 1;
        }

        [PTPDFNet Terminate: 0];
        return exit_code;
    }
}

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales