Extract Image from PDFs - PHP Sample Code

Sample code for using Apryse SDK to extract images from PDF files, along with their positioning information and DPI; provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB. Instead of converting PDF images to a Bitmap, you can also extract uncompressed/compressed image data directly using element.GetImageData() (described in the PDF Data Extraction code sample).

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1<?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
// The test files live in the TestFiles folder two levels above the current
// working directory; extracted images are written to its Output subfolder.
$input_path = getcwd() . "/../../TestFiles/";
$output_path = "{$input_path}Output/";
13
14//-----------------------------------------------------------------------------------
15// This sample illustrates one approach to PDF image extraction
16// using PDFNet.
17//
18// Note: Besides direct image export, you can also convert PDF images
19// to GDI+ Bitmap, or extract uncompressed/compressed image data directly
20// using element.GetImageData() (e.g. as illustrated in ElementReaderAdv
21// sample project).
22//-----------------------------------------------------------------------------------
23
// Number of images reported so far. ImageExtract() increments it through
// `global`, and the main script resets it before the second example.
$image_counter = 0;

/**
 * Walks the elements supplied by an ElementReader, printing the properties of
 * every image it encounters and exporting each XObject-backed image to a file
 * under $output_path.
 *
 * Form XObjects are entered with FormBegin() and processed recursively, so
 * images nested inside forms are reported as well.
 *
 * Reads and increments the global $image_counter and reads the global
 * $output_path.
 *
 * @param ElementReader $reader Reader on which Begin() (or FormBegin()) has
 *                              already been called by the caller.
 * @return void
 */
function ImageExtract($reader)
{
    global $image_counter, $output_path;

    while (($element = $reader->Next()) != null)
    {
        $type = $element->GetType();

        switch ($type)
        {
            case Element::e_image:
            case Element::e_inline_image:
                echo nl2br("--> Image: ".++$image_counter."\n");
                echo nl2br(" Width: ".$element->GetImageWidth()."\n");
                echo nl2br(" Height: ".$element->GetImageHeight()."\n");
                echo nl2br(" BPC: ".$element->GetBitsPerComponent()."\n");

                // Report the matrix offsets (m_h, m_v) as the first corner and
                // the point (1, 1) transformed by the CTM as the second corner.
                $ctm = $element->GetCTM();
                $x2 = 1.0;
                $y2 = 1.0;
                $point = $ctm->Mult(new Point($x2, $y2));

                // Routed through nl2br() like every other line so that the
                // line break also shows up when the output is viewed as HTML.
                echo nl2br(sprintf(
                    " Coords: x1=%.2f, y1=%.2f, x2=%.2f, y2=%.2f\n",
                    $ctm->m_h,
                    $ctm->m_v,
                    $point->x,
                    $point->y
                ));

                // Only e_image elements are exported here; inline images are
                // reported above but not written to disk.
                if ($type == Element::e_image)
                {
                    $image = new Image($element->GetXObject());

                    $fname = "image_extract1_".$image_counter;
                    $path = $output_path.$fname;
                    $image->Export($path);

                    // Alternative export formats:
                    //$path = $output_path.$fname.".tif";
                    //$image->ExportAsTiff($path);

                    //$path = $output_path.$fname.".png";
                    //$image->ExportAsPng($path);
                }
                break;

            case Element::e_form: // Process form XObjects
                $reader->FormBegin();
                ImageExtract($reader);
                $reader->End();
                break;
        }
    }
}
71
// Start up PDFNet with the license key before any other API is used.
PDFNet::Initialize($LicenseKey);

// Wait for fonts to be loaded if they haven't already. This is done because
// PHP can run into errors when shutting down if font loading is still in progress.
PDFNet::GetSystemFontList();

// Example 1:
// Walk the display list of every page and hand each one to ImageExtract().
// This route makes image positioning information and DPI available.
$doc = new PDFDoc($input_path."newsletter.pdf");
$doc->InitSecurityHandler();

$reader = new ElementReader();

// Visit the pages one at a time through the document's page iterator.
$itr = $doc->GetPageIterator();
while ($itr->HasNext())
{
    $reader->Begin($itr->Current());
    ImageExtract($reader);
    $reader->End();

    $itr->Next();
}

$doc->Close();
echo nl2br("Done.\n");

echo nl2br("----------------------------------------------------------------\n");
96
// Example 2:
// Find images by scanning the objects of the low-level (SDF) document
// instead of walking each page's display list.
$doc = new PDFDoc($input_path."newsletter.pdf");
$doc->InitSecurityHandler();

// Restart the numbering so this example's images are counted from 1 again.
$image_counter = 0;

$cos_doc = $doc->GetSDFDoc();
$num_objs = $cos_doc->XRefSize();

for ($i = 1; $i < $num_objs; ++$i)
{
    $obj = $cos_doc->GetObj($i);

    // Only non-free stream objects are candidates.
    if ($obj == null || $obj->IsFree() || !$obj->IsStream())
    {
        continue;
    }

    // Keep only objects whose Type entry is "XObject" ...
    $itr = $obj->Find("Type");
    if (!$itr->HasNext() || $itr->Value()->GetName() != "XObject")
    {
        continue;
    }

    // ... and whose Subtype entry is "Image".
    $itr = $obj->Find("Subtype");
    if (!$itr->HasNext() || $itr->Value()->GetName() != "Image")
    {
        continue;
    }

    $image = new Image($obj);

    ++$image_counter;
    echo nl2br("--> Image: ".$image_counter."\n");
    echo nl2br(" Width: ".$image->GetImageWidth()."\n");
    echo nl2br(" Height: ".$image->GetImageHeight()."\n");
    echo nl2br(" BPC: ".$image->GetBitsPerComponent()."\n");

    $fname = "image_extract2_".$image_counter;
    $path = $output_path.$fname;
    $image->Export($path);

    // Alternative export formats:
    //$path = $output_path.$fname.".tif";
    //$image->ExportAsTiff($path);

    //$path = $output_path.$fname.".png";
    //$image->ExportAsPng($path);
}

$doc->Close();
PDFNet::Terminate();
echo nl2br("Done.\n");
145
146?>

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales