PDF Logical Structure Reader - PHP Sample Code

Sample code for using Apryse Server SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElements and ContentItems; ContentItems are the leaf nodes of the structure tree. Sample code is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with Server SDK.

1<?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
// Folder containing the test files, expressed relative to the current working directory.
$input_path = getcwd() . "/../../TestFiles/";
// Folder for generated files: the "Output" subfolder of the test-file folder.
$output_path = "{$input_path}Output/";
13
14//---------------------------------------------------------------------------------------
15// This sample explores the structure and content of a tagged PDF document and dumps
16// the structure information to the console window.
17//
18// In tagged PDF documents StructTree acts as a central repository for information
19// related to a PDF document's logical structure. The tree consists of StructElement-s
20// and ContentItem-s which are leaf nodes of the structure tree.
21//
22// The sample can be extended to access and extract the marked-content elements such
23// as text and images.
24//---------------------------------------------------------------------------------------
25
// Starts a new output line and indents it.
//
// Echoes the nl2br()-converted line break first, then one space for each
// indentation level requested in $ident.
function PrintIdent($ident)
{
    echo nl2br("\n");

    $column = 0;
    while ($column < $ident) {
        echo " ";
        $column++;
    }
}
27
// Used in code snippet 1.
//
// Recursively prints a structure element and everything below it.
//
// $element - structure element to report; invalid elements are skipped silently.
// $ident   - indentation level for this element's header line. Its kids are
//            printed one level deeper.
function ProcessStructElement($element, $ident)
{
    // Nothing to report for an invalid element.
    if (!$element->IsValid()) {
        return;
    }

    // Header line: the element's type and, if it has one, its title.
    PrintIdent($ident);
    echo "Type: ".$element->GetType();
    if ($element->HasTitle()) {
        echo ". Title: ".$element->GetTitle();
    }

    // Everything below this element is printed one level deeper.
    $child_ident = $ident + 1;

    $kid_count = $element->GetNumKids();
    for ($kid = 0; $kid < $kid_count; ++$kid) {
        // A kid that is not a content item is another StructElement node: recurse.
        if (!$element->IsContentItem($kid)) {
            ProcessStructElement($element->GetAsStructElem($kid), $child_ident);
            continue;
        }

        // Otherwise the kid is a leaf node (i.e. it is a ContentItem).
        $item = $element->GetAsContentItem($kid);
        $item_type = $item->GetType();
        $item_page = $item->GetPage();

        PrintIdent($child_ident);
        echo "Content Item. Part of page #".$item_page->GetIndex();

        // The second line depends on the kind of content item; unknown kinds
        // leave the freshly indented line empty.
        PrintIdent($child_ident);
        if ($item_type == ContentItem::e_MCID || $item_type == ContentItem::e_MCR) {
            echo "MCID: ".$item->GetMCID();
        }
        elseif ($item_type == ContentItem::e_OBJR) {
            echo "OBJR ";
            // The object number is printed only when a referenced object is returned.
            $ref_obj = $item->GetRefObj();
            if ($ref_obj) {
                echo "- Referenced Object#: ".$ref_obj->GetObjNum();
            }
        }
    }
}
77
// Used in code snippet 2.
//
// Reads every element the reader yields and, for paths, text runs and form
// XObjects, prints the element kind followed by information about the
// structure element it belongs to (if any).
//
// $reader - an element reader on which Begin() has already been called; it is
//           consumed until Next() returns a falsy value.
function ProcessElements($reader)
{
    while ($element = $reader->Next()) // Read page contents
    {
        // In this sample we process only paths, text & form XObjects, but the
        // code can be extended to handle any element type.
        $type = $element->GetType();

        // Note: the third alternative must be e_form. Testing e_path twice
        // made the e_form case below unreachable.
        if ($type == Element::e_path || $type == Element::e_text || $type == Element::e_form)
        {
            switch ($type) {
            case Element::e_path:   // Process path ...
                echo nl2br("\nPATH: ");
                break;
            case Element::e_text:   // Process text ...
                echo nl2br("\nTEXT: ".$element->GetTextString()."\n");
                break;
            case Element::e_form:   // Process form XObjects
                echo nl2br("\nFORM XObject: ");
                // To descend into the form's own content, wrap a recursive call:
                //   $reader->FormBegin();
                //   ProcessElements($reader);
                //   $reader->End();
                break;
            }

            // Check if the element is associated with any structural element.
            // Content items are leaf nodes of the structure tree.
            $struct_parent = $element->GetParentStructElement();
            if ($struct_parent->IsValid()) {
                // Print out the parent structural element's type, the element's
                // MCID, the parent's title (if any), and its object number.
                echo " Type: ".$struct_parent->GetType()
                    .", MCID: ".$element->GetStructMCID();
                if ($struct_parent->HasTitle()) {
                    echo ". Title: ".$struct_parent->GetTitle();
                }
                echo ", Obj#: ".$struct_parent->GetSDFObj()->GetObjNum();
            }
        }
    }
}
118
// Used in code snippet 3.
//
// Collects the text of one page, grouped by marked-content ID.
//
// $reader        - an element reader on which Begin() has already been called;
//                  it is consumed until Next() returns null.
// $mcid_page_map - (by reference) array keyed by MCID. The text string of each
//                  text element with a non-negative MCID is appended to the
//                  entry for that MCID, creating the entry on first use.
function ProcessElements2($reader, &$mcid_page_map)
{
    // Read page contents
    for ($element = $reader->Next(); $element != null; $element = $reader->Next()) {
        // In this sample we process only text, but the code can be extended
        // to handle paths, images, or any other Element type.
        $mcid = $element->GetStructMCID();
        if ($mcid < 0 || $element->GetType() != Element::e_text) {
            continue;
        }

        $text = $element->GetTextString();
        $mcid_page_map[$mcid] = array_key_exists($mcid, $mcid_page_map)
            ? $mcid_page_map[$mcid].$text
            : $text;
    }
}
139
// Used in code snippet 3.
//
// Recursively prints a structure element as an XML-style tag pair, with the
// collected page text of its marked-content kids in between.
//
// $element      - structure element to print; invalid elements are skipped.
// $mcid_doc_map - (by reference) array keyed by page index whose values are
//                 arrays keyed by MCID holding the text to print.
// $ident        - indentation level for the opening and closing tag lines;
//                 nested structure elements are printed one level deeper.
function ProcessStructElement2($element, &$mcid_doc_map, $ident)
{
    if (!$element->IsValid()) {
        return;
    }

    // Opening tag: the element type, plus a title attribute when there is one.
    PrintIdent($ident);
    echo "<".$element->GetType();
    if ($element->HasTitle()) {
        echo " title=\"".$element->GetTitle()."\"";
    }
    echo ">";

    $kid_count = $element->GetNumKids();
    for ($kid = 0; $kid < $kid_count; ++$kid) {
        // A kid that is not a content item is another StructElement node.
        if (!$element->IsContentItem($kid)) {
            ProcessStructElement2($element->GetAsStructElem($kid), $mcid_doc_map, $ident + 1);
            continue;
        }

        // Only marked-content (MCID) items contribute text.
        $item = $element->GetAsContentItem($kid);
        if ($item->GetType() != ContentItem::e_MCID) {
            continue;
        }

        // Look up the text collected for this item's page and MCID; print it
        // only when both lookups succeed.
        $page_num = $item->GetPage()->GetIndex();
        if (!array_key_exists($page_num, $mcid_doc_map)) {
            continue;
        }
        $page_text = $mcid_doc_map[$page_num];
        $mcid = $item->GetMCID();
        if (array_key_exists($mcid, $page_text)) {
            echo $page_text[$mcid];
        }
    }

    // Closing tag on its own line, at the same indentation as the opening tag.
    PrintIdent($ident);
    echo "</".$element->GetType().">";
}
178
PDFNet::Initialize($LicenseKey);
// Wait for fonts to be loaded if they haven't already. This is done because PHP
// can run into errors when shutting down if font loading is still in progress.
PDFNet::GetSystemFontList();

// Extract logical structure from a PDF document

$doc = new PDFDoc($input_path."tagged.pdf");
$doc->InitSecurityHandler();

// Rule printed ahead of each of the three samples.
$separator = "____________________________________________________________\n";

// ---- Sample 1: walk the structure tree from its root kids downwards. ----
echo nl2br($separator);
echo nl2br("Sample 1 - Traverse logical structure tree...\n");

$tree = $doc->GetStructTree();
if (!$tree->IsValid()) {
    echo nl2br("This document does not contain any logical structure.\n");
}
else {
    echo nl2br("Document has a StructTree root.\n");

    // Recursively get structure info for all child elements.
    $root_kids = $tree->GetNumKids();
    for ($kid = 0; $kid < $root_kids; ++$kid) {
        ProcessStructElement($tree->GetKid($kid), 0);
    }
}

echo nl2br("\nDone 1.\n");

// ---- Sample 2: start from the page content and look up each element's parent. ----
echo nl2br($separator);
echo nl2br("Sample 2 - Get parent logical structure elements from\n");
echo nl2br("layout elements.\n");

$reader = new ElementReader();
$itr = $doc->GetPageIterator();
while ($itr->HasNext()) {
    $reader->Begin($itr->Current());
    ProcessElements($reader);
    $reader->End();
    $itr->Next();
}

echo nl2br("\nDone 2.\n");

// ---- Sample 3: collect text per page and MCID, then print the tree with it. ----
echo nl2br($separator);
echo nl2br("Sample 3 - 'XML style' extraction of PDF logical structure and page content.\n");

// Page index => (MCID => text), filled one page at a time.
$mcid_doc_map = [];
$reader = new ElementReader();
$itr = $doc->GetPageIterator();
while ($itr->HasNext()) {
    $page = $itr->Current();
    $page_index = $page->GetIndex();

    $reader->Begin($page);
    $mcid_doc_map[$page_index] = [];
    ProcessElements2($reader, $mcid_doc_map[$page_index]);
    $reader->End();

    $itr->Next();
}

$tree = $doc->GetStructTree();
if ($tree->IsValid()) {
    $root_kids = $tree->GetNumKids();
    for ($kid = 0; $kid < $root_kids; ++$kid) {
        ProcessStructElement2($tree->GetKid($kid), $mcid_doc_map, 0);
    }
}

echo nl2br("\nDone 3.\n");

// Write the document to the output folder, then release the document and the library.
$doc->Save($output_path."LogicalStructure.pdf", SDFDoc::e_linearized);
$doc->Close();
PDFNet::Terminate();
240?>

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales