Extract Text, Read, Parse PDF - TextExtract - PHP Sample Code

Sample code showing how to use the Apryse SDK to read a PDF (parse and extract text), provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. If you'd like to search for text on PDF pages, see our code sample for text search. Learn more about our Server SDK and PDF Data Extraction SDK Capabilities.

1<?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
10// Path to the sample input PDF, built relative to the current working directory.
11$input_path = getcwd()."/../../TestFiles/newsletter.pdf";
12
13//---------------------------------------------------------------------------------------
14// This sample illustrates the basic text extraction capabilities of PDFNet.
15//---------------------------------------------------------------------------------------
16
/**
 * Echoes every text-related element delivered by an element reader.
 *
 * All output goes through nl2br() so line breaks show up when the result is
 * viewed in a browser. Form XObjects are entered with FormBegin(), dumped
 * recursively with the same reader, and left again with End().
 *
 * @param ElementReader $reader Reader on which the caller has already started
 *                              a page (or form) traversal.
 * @return void
 */
function DumpAllText($reader)
{
    // Loose comparison on purpose: the loop stops on whatever "no more
    // elements" value Next() yields, exactly as the original sample did.
    while (($element = $reader->Next()) != null) {
        switch ($element->GetType()) {
            case Element::e_text_begin:
                echo nl2br("\n--> Text Block Begin\n");
                break;

            case Element::e_text_end:
                echo nl2br("\n--> Text Block End\n");
                break;

            case Element::e_text:
                // Print the bounding box first, then the text run itself.
                $bbox = $element->GetBBox();
                echo nl2br("\n--> BBox: ".$bbox->x1.", "
                    .$bbox->y1.", "
                    .$bbox->x2.", "
                    .$bbox->y2."\n");

                $arr = $element->GetTextString();
                echo nl2br($arr."\n");
                break;

            case Element::e_text_new_line:
                echo nl2br("\n--> New Line\n");
                break;

            case Element::e_form: // Process form XObjects
                $reader->FormBegin();
                // Must pass the $reader variable; the bare word `reader` is an
                // undefined constant and aborts the dump on the first form.
                DumpAllText($reader);
                $reader->End();
                break;
        }
    }
}
53
/**
 * Helper for ReadTextFromRect: collects the text of every text element whose
 * bounding box intersects the given rectangle.
 *
 * Each matching text run is appended to the result followed by nl2br("\n").
 * Form XObjects are entered with FormBegin(), searched recursively, and left
 * with End(). All other element types are ignored.
 *
 * @param ElementReader $reader Reader on which a traversal has already begun.
 * @param Rect          $pos    Selection rectangle to test each text box against.
 * @return string Concatenated text of all intersecting text elements.
 */
function RectTextSearch($reader, $pos)
{
    $collected = "";

    for ($item = $reader->Next(); $item != null; $item = $reader->Next()) {
        $kind = $item->GetType();

        if ($kind == Element::e_text) {
            $box = $item->GetBBox();
            // IntersectRect is invoked on the element's own box, with that
            // box and the selection rectangle as arguments.
            if ($box->IntersectRect($box, $pos)) {
                $collected .= $item->GetTextString() . nl2br("\n");
            }
        } elseif ($kind == Element::e_form) {
            // Process form XObjects
            $reader->FormBegin();
            $collected .= RectTextSearch($reader, $pos);
            $reader->End();
        }
        // Element::e_text_new_line and every other type contribute nothing.
    }

    return $collected;
}
88
/**
 * Extracts all text content that falls inside a selection rectangle on a page.
 *
 * The rectangle coordinates are expressed in the PDF user/page coordinate
 * system. The reader is started on the page, searched with RectTextSearch(),
 * and ended again before the result is returned.
 *
 * @param Page          $page   Page to read.
 * @param Rect          $pos    Selection rectangle.
 * @param ElementReader $reader Reader used for the traversal.
 * @return string Text gathered by RectTextSearch() for the rectangle.
 */
function ReadTextFromRect($page, $pos, $reader)
{
    $reader->Begin($page);
    $selected_text = RectTextSearch($reader, $pos);
    $reader->End();

    return $selected_text;
}
99
/**
 * Echoes a ` style="..."` attribute describing a text style.
 *
 * The attribute contains the font name, the font size, an optional generic
 * family token and the colour as six upper-case hex digits built from the
 * first three components returned by GetColor().
 *
 * NOTE(review): the token emitted when IsSerif() is true is " sans-serif; ".
 * That looks inverted, but it is reproduced unchanged here - confirm the
 * intended output before altering it.
 *
 * @param object $style Style object exposing GetColor(), GetFontName(),
 *                      GetFontSize() and IsSerif().
 * @return void
 */
function PrintStyle($style)
{
    $rgb = $style->GetColor();
    $hex_color = sprintf("%02X%02X%02X;", $rgb[0], $rgb[1], $rgb[2]);

    $font_name = $style->GetFontName();
    $font_size = $style->GetFontSize();

    if ($style->IsSerif()) {
        $generic_family = " sans-serif; ";
    } else {
        $generic_family = " ";
    }

    echo " style=\"font-family:" . $font_name . "; "
        . "font-size:" . $font_size . ";"
        . $generic_family
        . "color:#" . $hex_color . "\"";
}
109
/**
 * Tells whether two text styles describe the same appearance.
 *
 * Two styles are equal when their font name, font size, serif flag and colour
 * all match. Comparisons are loose (==), as in the original sample.
 *
 * The previous version compared $style1's font size and serif flag with
 * themselves, so styles that differed only in those fields were reported as
 * equal; both are now compared against $style2.
 *
 * @param object $style1 First style (GetFontName/GetFontSize/IsSerif/GetColor).
 * @param object $style2 Second style, same interface.
 * @return bool True when all four properties match.
 */
function IsStyleEqual($style1, $style2)
{
    return $style1->GetFontName() == $style2->GetFontName()
        && $style1->GetFontSize() == $style2->GetFontSize()
        // Same truth value on both sides, i.e. the negation of xor.
        && !($style1->IsSerif() xor $style2->IsSerif())
        && $style1->GetColor() == $style2->GetColor();
}
120//---------------------------------------------------------------------------------------
121
// Start the library with the license key loaded from LicenseKey.php.
PDFNet::Initialize($LicenseKey);
// Wait for fonts to be loaded if they haven't already. This is done because
// PHP can run into errors when shutting down if font loading is still in progress.
PDFNet::GetSystemFontList();

// Switches selecting which of the examples below are executed.
$example1_basic = false;
$example2_xml = false;
$example3_wordlist = false;
$example4_advanced = true;
$example5_low_level = false;

// Sample code showing how to use high-level text extraction APIs.

$doc = new PDFDoc($input_path);
$doc->InitSecurityHandler();

$page = $doc->GetPage(1);
if (!$page) {
    echo nl2br("Page not found.\n");
    // Release the document and shut the library down on this early exit as
    // well; previously this path returned without closing either of them.
    $doc->Close();
    PDFNet::Terminate();
    return;
}

$txt = new TextExtractor();
$txt->Begin($page); // Read the page.
// Other options you may want to consider (shown in C++-style notation)...
// txt.Begin(*itr, 0, TextExtractor::e_no_dup_remove);
// txt.Begin(*itr, 0, TextExtractor::e_remove_hidden_text);
147
// Example 1. Get all text on the page in a single string.
// Words will be separated with space or new line characters.
if ($example1_basic) {
    // Report the word count first (this line is not passed through nl2br).
    $word_count = $txt->GetWordCount();
    echo "Word Count: ", $word_count, "\n";

    // Then dump the page text; the header, the text and the trailing newline
    // go through a single nl2br() call.
    $text = $txt->GetAsText();
    $report = "\n\n- GetAsText --------------------------\n" . $text . "\n";
    echo nl2br($report);
    echo nl2br("-----------------------------------------------------------\n");
}
159
// Example 2. Get XML logical structure for the page.
if ($example2_xml) {
    // Combine the three output flags passed to GetAsXML().
    $xml_flags = TextExtractor::e_words_as_elements
        | TextExtractor::e_output_bbox
        | TextExtractor::e_output_style_info;

    $text = $txt->GetAsXML($xml_flags);
    $report = "\n\n- GetAsXML --------------------------\n" . $text . "\n";
    echo nl2br($report);
    echo nl2br("-----------------------------------------------------------\n");
}
167
// Example 3. Extract words one by one.
if ($example3_wordlist) {
    // Walk every line, and within each line every word, until the
    // respective object reports itself as no longer valid.
    $line = $txt->GetFirstLine();
    while ($line->IsValid()) {
        $word = $line->GetFirstWord();
        while ($word->IsValid()) {
            echo nl2br($word->GetString() . "\n");
            $word = $word->GetNextWord();
        }
        $line = $line->GetNextLine();
    }
    echo nl2br("-----------------------------------------------------------\n");
}
178
// Example 4. A more advanced text extraction example.
// The output is XML structure containing paragraphs, lines, words,
// as well as style and positioning information.
if ($example4_advanced) {
    // -1 means "no flow / paragraph element is currently open".
    $cur_flow_id = -1;
    $cur_para_id = -1;

    echo nl2br("<PDFText>\n");
    // For each line on the page...
    for ($line = $txt->GetFirstLine(); $line->IsValid(); $line = $line->GetNextLine()) {
        if ($line->GetNumWords() == 0) {
            continue;
        }

        // A change of flow closes the open paragraph and flow (if any)
        // before the new <Flow> element is started.
        if ($cur_flow_id != $line->GetFlowID()) {
            if ($cur_flow_id != -1) {
                if ($cur_para_id != -1) {
                    $cur_para_id = -1;
                    echo nl2br("</Para>\n");
                }
                echo nl2br("</Flow>\n");
            }
            $cur_flow_id = $line->GetFlowID();
            echo nl2br("<Flow id=\"".$cur_flow_id."\">\n");
        }

        // Likewise, a change of paragraph closes the previous <Para>.
        if ($cur_para_id != $line->GetParagraphID()) {
            if ($cur_para_id != -1) {
                echo nl2br("</Para>\n");
            }
            $cur_para_id = $line->GetParagraphID();
            echo nl2br("<Para id=\"".$cur_para_id."\">\n");
        }

        $bbox1 = $line->GetBBox();
        $line_style = $line->GetStyle();
        printf("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", $bbox1->x1, $bbox1->y1, $bbox1->x2, $bbox1->y2);
        PrintStyle($line_style);
        echo " cur_num=\"".$line->GetCurrentNum()."\"";
        echo nl2br(">\n");

        // For each word in the line...
        for ($word = $line->GetFirstWord(); $word->IsValid(); $word = $word->GetNextWord()) {
            // Skip empty words BEFORE emitting anything. The check used to
            // run after the opening tag had been printed, which left an
            // unterminated "<Word ..." fragment in the output.
            if ($word->GetStringLen() == 0) {
                continue;
            }

            // Output the bounding box for the word.
            $bbox2 = $word->GetBBox();
            printf("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", $bbox2->x1, $bbox2->y1, $bbox2->x2, $bbox2->y2);
            echo " cur_num=\"".$word->GetCurrentNum()."\"";

            // If the word style is different from the parent style, output the new style.
            $s = $word->GetStyle();
            if (!$s->IsEqual($line_style)) {
                PrintStyle($s);
            }

            // The word text comes straight from the PDF; escape &, < and >
            // so document content cannot break or inject markup.
            echo ">".htmlspecialchars($word->GetString(), ENT_NOQUOTES | ENT_XML1 | ENT_SUBSTITUTE, 'UTF-8');
            echo nl2br("</Word>\n");
        }
        echo nl2br("</Line>\n");
    }

    // Close whatever is still open after the last line.
    if ($cur_flow_id != -1) {
        if ($cur_para_id != -1) {
            $cur_para_id = -1;
            echo nl2br("</Para>\n");
        }
        echo nl2br("</Flow>\n");
    }
    echo nl2br("</PDFText>\n");
}

// Release the extractor and the document whether or not example 4 ran;
// previously they were only released when example 4 was enabled.
$txt->Destroy();
$doc->Close();
256
// Example 5. Low-level text extraction with ElementReader.
if ($example5_low_level) {
    $doc = new PDFDoc($input_path);
    $doc->InitSecurityHandler();

    // Example 1. Extract all text content from the document

    $reader = new ElementReader();

    // Read every page
    $itr = $doc->GetPageIterator();
    while ($itr->HasNext()) {
        $reader->Begin($itr->Current());
        DumpAllText($reader);
        $reader->End();
        $itr->Next();
    }

    // Example 2. Extract text content based on the
    // selection rectangle.
    echo nl2br("\n----------------------------------------------------");
    echo nl2br("\nExtract text based on the selection rectangle.");
    echo nl2br("\n----------------------------------------------------\n");

    $first_page = $doc->GetPage(1);

    // Output label => rectangle coordinates, in the order they are printed.
    $selections = [
        "\nField 1: " => [27.0, 392.0, 563.0, 534.0],
        "\nField 2: " => [28.0, 551.0, 106.0, 623.0],
        "\nField 3: " => [208.0, 550.0, 387.0, 621.0],
    ];
    foreach ($selections as $label => $coords) {
        list($x1, $y1, $x2, $y2) = $coords;
        $s1 = ReadTextFromRect($first_page, new Rect($x1, $y1, $x2, $y2), $reader);
        echo nl2br($label.$s1);
    }

    // ...
    $doc->Close();
    echo nl2br("Done.\n");
}

// Shut the library down at the end of the script.
PDFNet::Terminate();
295?>

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales