PDF Data Extraction - Images, Text, Paths - PHP Sample Code

Sample code for using Apryse SDK to extract text, paths, and images from a PDF. The sample also shows how to do color conversion, image normalization, and process changes in the graphics state. Sample code provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1<?php
2#---------------------------------------------------------------------------------------
3# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
4# Consult LICENSE.txt regarding license information.
5#---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
/**
 * Prints a textual dump of one path element and reacts to graphics-state changes.
 *
 * The path's operator list is walked in order; each operator pulls the number of
 * coordinates it needs from the point list and is echoed in an SVG-like notation
 * (M = move, L = line, C = cubic curve, rectangles expanded to four corners + Z).
 * Afterwards the function reports whether the path is stroked and/or filled, and
 * iterates the reader's list of changed graphics-state attributes. If the fill
 * color changed to a non-shading pattern, the pattern content is processed
 * recursively through ProcessElements().
 *
 * @param mixed $reader Element reader that produced $path; used for the GState
 *                      change list and for descending into pattern content.
 * @param mixed $path   Path element to describe.
 * @return void
 */
function ProcessPath($reader, $path)
{
    if ($path->IsClippingPath()) {
        echo nl2br("This is a clipping path\n");
    }

    $pathData = $path->GetPathData();
    $points = $pathData->GetPoints();
    $operators = $pathData->GetOperators();

    $operatorTotal = count((array)$operators);
    // The point total is computed as in the original sample; the loop below is
    // driven by the operators only and never consults it.
    $pointTotal = count($points);

    // Use path.GetCTM() if you are interested in CTM (current transformation matrix).

    echo " Path Data Points := \"";

    $cursor = 0; // index of the next unread coordinate in $points
    $op = 0;
    while ($op < $operatorTotal) {
        switch ($operators[$op]) {
            case PathData::e_moveto:
                $x1 = $points[$cursor++];
                $y1 = $points[$cursor++];
                printf("M%.5g %.5g", $x1, $y1);
                break;

            case PathData::e_lineto:
                $x1 = $points[$cursor++];
                $y1 = $points[$cursor++];
                printf(" L%.5g %.5g", $x1, $y1);
                break;

            case PathData::e_cubicto:
                // Three coordinate pairs: two control points, then the end point.
                $x1 = $points[$cursor++];
                $y1 = $points[$cursor++];
                $x2 = $points[$cursor++];
                $y2 = $points[$cursor++];
                $x3 = $points[$cursor++];
                $y3 = $points[$cursor++];
                printf(" C%.5g %.5g %.5g %.5g %.5g %.5g", $x1, $y1, $x2, $y2, $x3, $y3);
                break;

            case PathData::e_rect:
                // A rectangle is stored as origin + width/height; print it as the
                // four corners of a closed sub-path.
                $left = $points[$cursor++];
                $bottom = $points[$cursor++];
                $w = $points[$cursor++];
                $h = $points[$cursor++];
                $right = $left + $w;
                $top = $bottom + $h;
                printf(
                    "M%.5g %.5g L%.5g %.5g L%.5g %.5g L%.5g %.5g Z",
                    $left, $bottom,
                    $right, $bottom,
                    $right, $top,
                    $left, $top
                );
                break;

            case PathData::e_closepath:
                echo nl2br(" Close Path\n");
                break;

            default:
                // Unknown operator: nothing is printed and no points are consumed.
                break;
        }
        ++$op;
    }

    echo "\" ";

    $gs = $path->GetGState();

    // Set Path State 0 (stroke, fill, fill-rule) -----------------------------------
    if ($path->IsStroked()) {
        echo nl2br("Stroke path\n");

        if ($gs->GetStrokeColorSpace()->GetType() == ColorSpace::e_pattern) {
            echo nl2br("Path has associated pattern\n");
        }
        // Otherwise the stroke color can be read (and converted with the PDFNet
        // color conversion facilities):
        // $rgb = $gs->GetStrokeColorSpace()->Convert2RGB($gs->GetStrokeColor());
    }

    if ($path->IsFilled()) {
        echo nl2br("Fill path\n");

        if ($gs->GetFillColorSpace()->GetType() == ColorSpace::e_pattern) {
            echo nl2br("Path has associated pattern\n");
        }
        // Otherwise:
        // $rgb = $gs->GetFillColorSpace()->Convert2RGB($gs->GetFillColor());
    }

    // Process any changes in graphics state ---------------------------------

    $changes = $reader->GetChangesIterator();
    while ($changes->HasNext()) {
        switch ($changes->Current()) {
            case GState::e_transform:
                // Get the transform matrix for this element. Unlike path.GetCTM(),
                // which returns the full transformation matrix, gs.GetTransform()
                // returns only the matrix that was installed for this element.
                //
                // $gs->GetTransform();
                break;

            case GState::e_line_width:
                // $gs->GetLineWidth();
                break;

            case GState::e_line_cap:
                // $gs->GetLineCap();
                break;

            case GState::e_line_join:
                // $gs->GetLineJoin();
                break;

            case GState::e_flatness:
                break;

            case GState::e_miter_limit:
                // $gs->GetMiterLimit();
                break;

            case GState::e_dash_pattern:
                // $dashes = $gs->GetDashes($dashes);
                // $gs->GetPhase()
                break;

            case GState::e_fill_color:
                if ($gs->GetFillColorSpace()->GetType() == ColorSpace::e_pattern) {
                    if ($gs->GetFillPattern()->GetType() != PatternColor::e_shading) {
                        // Process the pattern data
                        $reader->PatternBegin(true);
                        ProcessElements($reader);
                        $reader->End();
                    }
                }
                break;
        }
        $changes->Next();
    }

    $reader->ClearChangeList();
}
173
/**
 * Consumes one text block from the reader and prints what it contains.
 *
 * Intended to be called right after an Element::e_text_begin element was read.
 * Elements are pulled from $page_reader until Element::e_text_end is seen (the
 * function then returns) or the reader has no more elements. For every text run
 * the font name is printed; Type 3 font glyphs are processed recursively through
 * ProcessElements(), while for all other fonts the character codes in the
 * printable single-byte range are echoed and each character position is mapped
 * through the combined text matrix / CTM.
 *
 * @param mixed $page_reader Element reader positioned inside a text block.
 * @return void
 */
function ProcessText($page_reader)
{
    // Begin text element
    echo nl2br("Begin Text Block:\n");

    while (($element = $page_reader->Next()) != NULL)
    {
        switch ($element->GetType())
        {
        case Element::e_text_end:
            // Finish the text block
            echo nl2br("End Text Block.\n");
            return;

        case Element::e_text:
            {
                $gs = $element->GetGState();

                // The color lookups below are illustrative only; their results
                // are not used further in this sample.
                $cs_fill = $gs->GetFillColorSpace();
                $fill = $gs->GetFillColor();

                $out = $cs_fill->Convert2RGB($fill);

                $cs_stroke = $gs->GetStrokeColorSpace();
                $stroke = $gs->GetStrokeColor();

                $font = $gs->GetFont();

                echo nl2br("Font Name: ".$font->GetName()."\n");
                // $font->IsFixedWidth();
                // $font->IsSerif();
                // $font->IsSymbolic();
                // $font->IsItalic();
                // ...

                // $font_size = $gs->GetFontSize();
                // $word_spacing = $gs->GetWordSpacing();
                // $char_spacing = $gs->GetCharSpacing();
                // $txt = $element->GetTextString();

                if ( $font->GetType() == Font::e_Type3 )
                {
                    // Type 3 font: each glyph is itself a content stream, so
                    // process its data with the generic element walker.
                    for ($itr = $element->GetCharIterator(); $itr->HasNext(); $itr->Next())
                    {
                        $page_reader->Type3FontBegin($itr->Current());
                        ProcessElements($page_reader);
                        $page_reader->End();
                    }
                }
                else
                {
                    // To get the exact character positioning information you need to
                    // concatenate the current text matrix with the CTM (current
                    // transformation matrix) and then multiply the relative
                    // positioning coordinates with the resulting matrix.
                    //
                    // The combined matrix is built ONCE per text run. PHP object
                    // variables are handles, so assigning the text matrix to another
                    // variable and calling Concat() inside the character loop would
                    // fold the CTM into the same matrix again for every character.
                    $mtx = $element->GetTextMatrix();
                    $ctm = $element->GetCTM();
                    $mtx->Concat($ctm->m_a, $ctm->m_b, $ctm->m_c, $ctm->m_d, $ctm->m_h, $ctm->m_v);

                    for ($itr = $element->GetCharIterator(); $itr->HasNext(); $itr->Next())
                    {
                        $char_data = $itr->Current();

                        $char_code = $char_data->char_code;
                        // Print only codes in the printable single-byte range. Both
                        // bounds must hold, hence '&&' (with '||' the test is always true).
                        if ($char_code >= 32 && $char_code <= 255) {
                            echo chr($char_code);
                        }

                        // Character positioning information, relative to the text run.
                        $pt = new Point($char_data->x, $char_data->y);

                        // NOTE(review): the result of Mult() is not used by this sample;
                        // confirm against the SDK whether it returns the transformed
                        // point or updates $pt before relying on it.
                        $mtx->Mult($pt);

                        // Get glyph path...
                        //$glyphPath = $font->GetGlyphPath($char_code, false, 0);
                        //$oprs = $glyphPath->GetOperators();
                        //$glyph_data = $glyphPath->GetDataPoints();
                    }
                }

                echo nl2br("\n");
            }
            break;
        }
    }
}
264
/**
 * Prints the dimensions of an image element and reads its pixels as RGB data.
 *
 * The image is wrapped in an Image2RGB filter and read through a FilterReader,
 * requesting width * height * 3 bytes in a single Read() call. The data is only
 * read, not stored or written anywhere by this sample.
 *
 * @param mixed $image Image element to inspect.
 * @return void
 */
function ProcessImage($image)
{
    // Illustrative property lookups; the results are not used further here.
    $image_mask = $image->IsImageMask();
    $interpolate = $image->IsImageInterpolate();

    $width = $image->GetImageWidth();
    $height = $image->GetImageHeight();

    // Three bytes per pixel are requested from the RGB filter below.
    $out_data_sz = $width * $height * 3;

    // Both attribute values are wrapped in balanced double quotes (the height
    // value previously lacked its closing quote).
    echo "Image: "
        ." width=\"".$width."\""
        ." height=\"".$height."\"\n";

    // $mtx = $image->GetCTM(); // image matrix (page positioning info)

    // You can use GetImageData to read the raw (decoded) image data
    //$image->GetBitsPerComponent();
    //$image->GetImageData(); // get raw image data
    // .... or use the Image2RGB filter that converts every image to RGB format.
    // This should save you time since you don't need to deal with color conversions,
    // image up-sampling, decoding etc.

    $img_conv = new Image2RGB($image); // Extract and convert image to RGB 8-bpc format
    $reader = new FilterReader($img_conv);

    // A buffer used to keep image data.
    $image_data_out = $reader->Read($out_data_sz);
    // $image_data_out contains RGB image data.

    // Note that you don't need to read a whole image at a time. Alternatively
    // you can read a chunk at a time by repeatedly calling reader.Read(buf_sz)
    // until the function returns 0.
}
298
/**
 * Reads elements from the reader until it is exhausted and dispatches each one
 * by type: paths to ProcessPath(), text blocks to ProcessText(), images to
 * ProcessImage(). Form XObjects are entered with FormBegin(), processed
 * recursively, and left again with End(). All other element types are ignored.
 *
 * @param mixed $reader Element reader already positioned on the content to read.
 * @return void
 */
function ProcessElements($reader)
{
    // Read page contents one element at a time.
    for ($element = $reader->Next(); $element != NULL; $element = $reader->Next()) {
        $kind = $element->GetType();

        if ($kind == Element::e_path) {
            // Process path data...
            ProcessPath($reader, $element);
        } elseif ($kind == Element::e_text_begin) {
            // Process text block...
            ProcessText($reader);
        } elseif ($kind == Element::e_form) {
            // Process form XObjects
            $reader->FormBegin();
            ProcessElements($reader);
            $reader->End();
        } elseif ($kind == Element::e_image) {
            // Process Images
            ProcessImage($element);
        }
    }
}
330
# Driver script: opens the sample document and dumps the element information
# of every page through ProcessElements().

# Relative path to the folder containing the test files.
$input_path = getcwd()."/../../TestFiles/";
$output_path = $input_path."Output/";

PDFNet::Initialize($LicenseKey);
# Wait for fonts to be loaded if they haven't already. This is done because PHP
# can run into errors when shutting down if font loading is still in progress.
PDFNet::GetSystemFontList();

# Extract text data from all pages in the document
$banner = [
    "__________________________________________________\n",
    "Extract page element information from all \n",
    "pages in the document.\n",
];
foreach ($banner as $banner_line) {
    echo nl2br($banner_line);
}

$doc = new PDFDoc($input_path."newsletter.pdf");
$doc->InitSecurityHandler();

$page_total = $doc->GetPageCount();
$page_itr = $doc->GetPageIterator();

$page_reader = new ElementReader();

# Read every page
while ($page_itr->HasNext()) {
    $page = $page_itr->Current();

    echo nl2br("Page ".$page->GetIndex()."----------------------------------------\n");
    $page_reader->Begin($page);
    ProcessElements($page_reader);
    $page_reader->End();

    $page_itr->Next();
}

$doc->Close();
PDFNet::Terminate();
echo nl2br("Done.\n");
361?>

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales