Document Structure Extraction Showcase Demo Code Sample

Easily produce a JSON file describing the structure of a PDF file. After extraction, the displayed PDF will have annotations showing the identified elements. The first two pages are analyzed, resulting in a JSON file that can be viewed.

Note
In this demo the first page in the JSON file is skipped as it contains a trial demo message. Processing starts on the next page after the trial demo page.

This demo lets you:

  • Upload PDF files to extract the structure into a JSON file
  • Add annotations automatically to describe the PDF structure

Implementation steps
To add Document Structure Extraction with WebViewer:

Step 1: Choose your preferred web stack
Step 2: Download any required modules listed in the Demo Dependencies section below
Step 3: Add the ES6 JavaScript sample code provided in this guide

Demo Dependencies
This sample uses the following:

Want to see a live version of this demo?

Try the Document Structure Extraction demo

1// ES6 Compliant Syntax
2// Copilot name: GitHub Copilot, version: 1.0.0, model: GPT-4, version: 2024-06, date: 2025-09-15
3// File: client/index.js
4
// **Important**
// A license key from Apryse is required for the server to run.
// A trial key can be obtained from:
// https://docs.apryse.com/core/guides/get-started/trial-key
const licenseKey = 'YOUR_LICENSE_KEY';

// Sample PDF that the viewer opens on startup.
const initialDoc = 'https://apryse.s3.us-west-1.amazonaws.com/public/files/samples/sales-invoice-with-credit-cards.pdf';

// DOM element handed to WebViewer as its mount point.
const viewerElement = document.getElementById('viewer');

// Parsed document-structure JSON received from the server.
// Reset to null every time a document is loaded.
let jsonData = null;
// Element types that get an annotation and a legend entry.
// Each tuple is [type, title, [R, G, B]] and is expanded into
// { type, title, color: { R, G, B, A } } with a fixed alpha of 1.
const documentStructureMap = [
  ['image', 'Image', [1, 129, 1]],
  ['paragraph', 'Paragraph', [254, 254, 1]],
  ['list', 'List', [254, 129, 193]],
  ['heading', 'Heading', [254, 166, 1]],
  ['header', 'Header', [200, 20, 128]],
  ['footer', 'Footer', [3, 219, 252]],
  ['graphic', 'Graphic', [255, 0, 0]],
  ['td', 'Table Data Cell', [51, 101, 251]],
  ['th', 'Table Header Cell', [128, 0, 128]],
  ['textbox', 'Text Box', [20, 230, 50]],
  ['group', 'Group', [0, 0, 0]],
].map(([type, title, [R, G, B]]) => ({ type, title, color: { R, G, B, A: 1 } }));
26
// Panels registered in the main WebViewer; populated by customizeUI.
let viewerPanels = null;

// The WebViewer left-hand tab panel. `handle` is looked up among the
// registered panels at runtime.
const tabPanel = { handle: null, dataElement: 'tabPanel' };

// The custom document structure sub-panel. `render` holds the panel's DOM
// content and `handle` the entry placed in the tab panel's sub-panel list;
// both are filled in when the panel is registered.
const documentStructurePanel = { handle: null, dataElement: 'documentStructurePanel', render: null };
42
/**
 * Customize the main WebViewer left panel after a document has loaded.
 *
 * Registers the custom document structure sub-panel, puts it first in the tab
 * panel's list of sub-panels, then reopens the tab panel and sets its width
 * to 400.
 *
 * @param {object} instance - The WebViewer instance; only `instance.UI` is used here.
 */
const customizeUI = (instance) => {
  const { UI } = instance;

  // Close the tab panel (if it's open) so it is refreshed when reopened.
  UI.closeElements([tabPanel.dataElement]);

  // Get the list of registered panels in the main webviewer
  viewerPanels = UI.getPanels();

  // Find the Tab Panel to modify. The document structure sub-panel will be added to this Tab panel.
  tabPanel.handle = viewerPanels.find((panel) => panel.dataElement === tabPanel.dataElement);

  // Register the custom document structure sub-panel
  RegisterDocumentStructurePanel(instance);

  if (!tabPanel.handle) {
    // Without the tab panel there is nothing to attach the sub-panel to;
    // report it instead of failing on an undefined handle.
    console.error(`❌ Could not find the '${tabPanel.dataElement}' panel; document structure panel was not attached`);
    return;
  }

  // This function runs on every documentLoaded event, so drop any entry left
  // by an earlier run before putting the sub-panel first in the list.
  documentStructurePanel.handle = { render: documentStructurePanel.dataElement };
  const otherPanels = (tabPanel.handle.panelsList ?? []).filter(
    (panel) => panel.render !== documentStructurePanel.dataElement
  );
  tabPanel.handle.panelsList = [documentStructurePanel.handle, ...otherPanels];

  UI.openElements([tabPanel.dataElement]);
  UI.setPanelWidth(tabPanel.dataElement, 400);
};
66
// Build the sub-panel's DOM content, remember it on documentStructurePanel,
// and add the panel to the left side of the WebViewer UI.
const RegisterDocumentStructurePanel = (instance) => {
  const panelIcon = '<svg width="18px" height="18px" viewBox="0 0 24 24" id="圖層_1" data-name="圖層 1" xmlns="http://www.w3.org/2000/svg"><defs><style>.cls-1{fill:#080808;}</style></defs><title>form</title><path class="cls-1" d="M21,.5H3a2,2,0,0,0-2,2V22a2,2,0,0,0,2,2H21a2,2,0,0,0,2-2V2.5A2,2,0,0,0,21,.5Zm0,2v2H3v-2ZM3,22V6.5H21V22Z"/><path class="cls-1" d="M12.5,4H20a.5.5,0,0,0,0-1H12.5a.5.5,0,0,0,0,1Z"/><path class="cls-1" d="M4.5,4a.43.43,0,0,0,.19,0,.35.35,0,0,0,.16-.11A.47.47,0,0,0,5,3.5a.43.43,0,0,0,0-.19.36.36,0,0,0-.11-.16.5.5,0,0,0-.7,0A.35.35,0,0,0,4,3.31.43.43,0,0,0,4,3.5a.51.51,0,0,0,.5.5Z"/><path class="cls-1" d="M5.65,3.85A.36.36,0,0,0,5.81,4,.44.44,0,0,0,6,4a.47.47,0,0,0,.35-.15.36.36,0,0,0,.11-.16.6.6,0,0,0,0-.19.51.51,0,0,0-.15-.35A.49.49,0,0,0,5.81,3a.36.36,0,0,0-.16.11.47.47,0,0,0-.15.35.4.4,0,0,0,0,.19A.35.35,0,0,0,5.65,3.85Z"/><path class="cls-1" d="M8,8H4.5a1,1,0,0,0,0,2H8A1,1,0,0,0,8,8Z"/><path class="cls-1" d="M8,11.67H4.5a1,1,0,0,0,0,2H8a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M8,15.33H4.5a1,1,0,0,0,0,2H8a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M8,19H4.5a1,1,0,0,0,0,2H8a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M14,8H10.5a1,1,0,0,0,0,2H14a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M14,11.67H10.5a1,1,0,0,0,0,2H14a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M14,15.33H10.5a1,1,0,0,0,0,2H14a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M14,19H10.5a1,1,0,0,0,0,2H14a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M19.5,8h-3a1,1,0,0,0,0,2h3a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M19.5,11.67h-3a1,1,0,0,0,0,2h3a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M19.5,15.33h-3a1,1,0,0,0,0,2h3a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M19.5,19h-3a1,1,0,0,0,0,2h3a1,1,0,0,0,0-2Z"/></svg>';

  // The panel content is created once here; the render callback below
  // hands back whatever is currently stored on documentStructurePanel.
  documentStructurePanel.render = CreateDocumentStructurePanelElements(instance);

  const panelOptions = {
    dataElement: documentStructurePanel.dataElement,
    location: 'left',
    icon: panelIcon,
    title: 'Document Structure',
    render: () => documentStructurePanel.render,
  };
  instance.UI.addPanel(panelOptions);
};
78
/**
 * Create the document structure panel elements.
 *
 * The panel contains an introductory text, an orange note, a divider and the
 * "Extract Document Structure" button. The button starts out disabled and
 * greyed out.
 *
 * @param {object} instance - The WebViewer instance, passed through to extractDocumentStructure.
 * @returns {HTMLDivElement} The root element of the panel.
 */
const CreateDocumentStructurePanelElements = (instance) => {
  const panelDiv = document.createElement('div');
  panelDiv.id = 'documentStructure';
  const paragraph = document.createTextNode('A sample PDF will have the first two pages analyzed and the resulting JSON can be viewed. After extraction, the displayed PDF will have annotations showing the identified elements.');
  panelDiv.appendChild(paragraph);

  const span = document.createElement('span');
  span.style.color = 'orange';
  span.appendChild(document.createTextNode('NOTE: Only the first two pages will be processed.'));
  panelDiv.appendChild(document.createElement('p'));
  panelDiv.appendChild(span);

  const dividerDiv = document.createElement('div');
  dividerDiv.style.borderTop = '1px solid #ccc';
  dividerDiv.style.margin = '10px 0';
  panelDiv.appendChild(dividerDiv);

  // Extract document structure button
  const extractDocumentStructureButton = document.createElement('button');
  extractDocumentStructureButton.textContent = 'Extract Document Structure';
  extractDocumentStructureButton.id = 'extractDocumentStructureButton';
  extractDocumentStructureButton.disabled = true;
  extractDocumentStructureButton.style.backgroundColor = 'gray';
  extractDocumentStructureButton.style.color = 'darkgray';
  extractDocumentStructureButton.onclick = async () => {
    // Show a "busy" cursor on the button and on this panel while extracting.
    // The panel is addressed directly (not through the shared
    // documentStructurePanel object) so the handler always targets the panel
    // this button lives in.
    extractDocumentStructureButton.style.cursor = 'not-allowed';
    panelDiv.style.cursor = 'not-allowed';

    enableButton(extractDocumentStructureButton, false);
    try {
      await extractDocumentStructure(instance); // Extract document structure
    } catch (error) {
      // An event handler has no caller to report to, so log the failure here.
      console.error('❌ Document structure extraction failed:', error);
    } finally {
      // Always restore the cursors, even when the extraction failed.
      extractDocumentStructureButton.style.cursor = 'default';
      panelDiv.style.cursor = 'default';
    }
  };

  panelDiv.appendChild(extractDocumentStructureButton);
  panelDiv.appendChild(document.createElement('p'));

  return panelDiv;
};
121
/**
 * Open JSON data in a modal viewer with zoom in/out and close buttons.
 *
 * The dialog is appended to document.body. It is closed by the "Close"
 * button or by clicking the overlay outside the modal box.
 *
 * @param {string} jsonText - The text to display; it is shown verbatim.
 */
const openJsonDataDialog = (jsonText) => {
  const MIN_FONT_SIZE = 10;
  const FONT_STEP = 2;
  let fontSize = 14;

  // Create overlay
  const overlay = document.createElement('div');
  overlay.className = 'modal-overlay';

  // remove() is a no-op when the overlay is already detached, so closing
  // twice cannot throw.
  const closeDialog = () => overlay.remove();
  overlay.onclick = (e) => {
    // Only clicks on the backdrop itself close the dialog, not clicks inside it.
    if (e.target === overlay) {
      closeDialog();
    }
  };

  // Modal box
  const modal = document.createElement('div');
  modal.className = 'modal-box';

  // Content
  const content = document.createElement('pre');
  content.className = 'modal-content';
  content.style.fontSize = `${fontSize}px`;
  // The text originates from the processed PDF, so it must never be parsed as
  // HTML: textContent shows it literally (including any '<' or '&').
  content.textContent = jsonText;

  const applyFontSize = (size) => {
    fontSize = size;
    content.style.fontSize = `${fontSize}px`;
  };

  // Controls
  const controls = document.createElement('div');
  controls.className = 'modal-controls';

  const zoomInBtn = document.createElement('button');
  zoomInBtn.textContent = '+';
  zoomInBtn.onclick = () => applyFontSize(fontSize + FONT_STEP);

  const zoomOutBtn = document.createElement('button');
  zoomOutBtn.textContent = '-';
  zoomOutBtn.onclick = () => applyFontSize(Math.max(MIN_FONT_SIZE, fontSize - FONT_STEP));

  const closeBtn = document.createElement('button');
  closeBtn.textContent = 'Close';
  closeBtn.className = 'modal-close';
  closeBtn.onclick = closeDialog;

  controls.appendChild(zoomInBtn);
  controls.appendChild(zoomOutBtn);
  controls.appendChild(closeBtn);

  modal.appendChild(controls);
  modal.appendChild(content);
  overlay.appendChild(modal);
  document.body.appendChild(overlay);
};
179
/**
 * Draw a rectangle annotation for the given item on the specified page.
 *
 * The stroke colour is looked up in documentStructureMap by `item.type`.
 * Items whose type has no entry in the map, or that carry no usable `rect`,
 * are skipped with a console warning so one unexpected element cannot abort
 * the drawing of all the others.
 *
 * @param {object} instance - The WebViewer instance (uses `Core.annotationManager` and `Core.Annotations`).
 * @param {number} pageNumber - Page the annotation is placed on.
 * @param {object} item - Extracted element with `type` and `rect`; `rect` is read as [x1, y1, x2, y2].
 */
const drawAnnotationRectangle = (instance, pageNumber, item) => {
  const { annotationManager, Annotations } = instance.Core;

  const mapEntry = documentStructureMap.find((field) => field.type === item?.type);
  if (!mapEntry) {
    console.warn(`⚠️ No color defined for element type '${item?.type}'; annotation skipped`);
    return;
  }
  if (!Array.isArray(item.rect) || item.rect.length < 4) {
    console.warn(`⚠️ Element of type '${item.type}' has no rectangle; annotation skipped`);
    return;
  }

  const [left, top, right, bottom] = item.rect;
  const { R, G, B, A } = mapEntry.color;
  const annot = new Annotations.RectangleAnnotation({
    PageNumber: pageNumber,
    X: left,
    Y: top,
    Width: right - left,
    Height: bottom - top,
    StrokeColor: new Annotations.Color(R, G, B, A),
    StrokeThickness: 2,
  });

  annotationManager.addAnnotation(annot);
  annotationManager.redrawAnnotation(annot);
};
199
/**
 * Draw annotations on the PDF for every element in the extracted jsonData.
 *
 * - 'table' elements: one rectangle per cell found under `trs[].tds[]`.
 * - 'graphic' elements: one rectangle per entry in `contents` when present,
 *   otherwise one rectangle for the element itself.
 * - every other element: one rectangle for the element itself.
 *
 * Missing `pages`, `elements`, `trs` or `tds` collections are treated as
 * empty, so incomplete data does not abort the drawing.
 *
 * @param {object} instance - The WebViewer instance, passed through to drawAnnotationRectangle.
 */
const drawAnnotations = (instance) => {
  for (const page of jsonData?.pages ?? []) {
    for (const element of page.elements ?? []) {
      switch (element.type) {
        case 'table':
          for (const tr of element.trs ?? []) {
            for (const td of tr.tds ?? []) {
              drawAnnotationRectangle(instance, page.properties.pageNumber, td);
            }
          }
          break;
        case 'graphic':
          if (element.contents) {
            element.contents.forEach((content) => {
              drawAnnotationRectangle(instance, page.properties.pageNumber, content);
            });
          } else {
            drawAnnotationRectangle(instance, page.properties.pageNumber, element);
          }
          break;
        default:
          drawAnnotationRectangle(instance, page.properties.pageNumber, element);
          break;
      }
    }
  }
};
228
/**
 * Remove the trial mode page from the JSON data.
 *
 * If a demo license key is provided instead of a production one, a page of
 * the received JSON data contains a message indicating that the Apryse SDK is
 * running in trial mode. The first page holding that message is removed and
 * the remaining pages are renumbered from 1.
 *
 * The page is located by its position in `json.pages`, not by its
 * `properties.pageNumber`, so the right page is removed even when the two
 * disagree.
 *
 * @param {object} json - Extracted document structure with a `pages` array; modified in place.
 * @returns {object} The same `json` object.
 */
const removeJSONTrialPage = (json) => {
  const TRIAL_MESSAGE = 'Apryse Data Extraction Module trial mode.';

  // True when any element on the page has a content entry carrying the trial text.
  const isTrialPage = (page) =>
    (page.elements ?? []).some(
      (element) =>
        Array.isArray(element?.contents) &&
        element.contents.some(
          (content) => typeof content?.text === 'string' && content.text.includes(TRIAL_MESSAGE)
        )
    );

  const pageIndexToRemove = json.pages.findIndex(isTrialPage);

  // Remove the page and update page numbers
  if (pageIndexToRemove !== -1) {
    json.pages.splice(pageIndexToRemove, 1);
    json.pages.forEach((page, index) => {
      page.properties.pageNumber = index + 1;
    });
  }

  return json;
};
265
/**
 * Remove the watermark rectangle from the JSON data.
 *
 * An element is treated as the watermark only when all four of its `rect`
 * coordinates equal the known watermark rectangle. Elements that merely share
 * one coordinate with it, or that have no `rect` at all, are kept.
 *
 * @param {object} json - Extracted document structure with a `pages` array; each page's `elements` is replaced in place.
 * @returns {object} The same `json` object.
 */
const removeWatermarkRect = (json) => {
  const apryseWatermarkRect = [214.67, 270.59, 397.85, 523.01];

  const isWatermark = (element) =>
    Array.isArray(element.rect) &&
    element.rect.length === apryseWatermarkRect.length &&
    apryseWatermarkRect.every((value, index) => element.rect[index] === value);

  json.pages.forEach((page) => {
    // Null/undefined entries are dropped as well, as they were before.
    page.elements = page.elements.filter((element) => element != null && !isWatermark(element));
  });

  return json;
};
277
// Build the "Color Legend" block: one bulleted entry per documentStructureMap item.
const createColorLegendElement = () => {
  const colorsDiv = document.createElement('div');
  // NOTE(review): the JSON block built by createJsonViewElement uses the same
  // id. It is left unchanged here in case a stylesheet targets it — confirm
  // and give the two blocks distinct ids.
  colorsDiv.id = 'json';
  colorsDiv.className = 'listContainer';
  const colorsTitle = document.createElement('h3');
  colorsTitle.textContent = 'Color Legend';
  colorsDiv.appendChild(colorsTitle);
  colorsDiv.appendChild(document.createElement('p'));

  documentStructureMap.forEach((field) => {
    const { R, G, B, A } = field.color;
    const cssColor = `rgba(${R}, ${G}, ${B}, ${A})`;

    const listItem = document.createElement('div');
    listItem.className = 'listItem';
    listItem.style.setProperty('--bullet-color', cssColor);
    listItem.style.setProperty('color', cssColor);
    listItem.style.setProperty('font-weight', 'bold');
    listItem.style.setProperty('position', 'relative');
    listItem.style.setProperty('padding-left', '20px');
    listItem.style.setProperty('margin', '8px 0');
    listItem.style.setProperty('list-style', 'none');
    listItem.style.setProperty('display', 'block');
    listItem.style.setProperty('line-height', '1.5');
    listItem.style.setProperty('font-size', '14px');

    // Round colour swatch shown in front of the title
    const bullet = document.createElement('span');
    bullet.style.width = '10px';
    bullet.style.height = '10px';
    bullet.style.borderRadius = '50%';
    bullet.style.backgroundColor = cssColor;
    bullet.style.display = 'inline-block';
    bullet.style.marginRight = '10px';
    bullet.style.verticalAlign = 'middle';

    listItem.appendChild(bullet);
    listItem.appendChild(document.createTextNode(field.title));
    colorsDiv.appendChild(listItem);
  });

  return colorsDiv;
};

// Build the "JSON Data" block: a scrollable view of the text plus an "Open in Dialog" button.
const createJsonViewElement = (jsonText) => {
  const jsonDiv = document.createElement('div');
  jsonDiv.id = 'json';
  const jsonTitle = document.createElement('h3');
  jsonTitle.textContent = 'JSON Data';
  jsonDiv.appendChild(jsonTitle);
  jsonDiv.appendChild(document.createElement('p'));

  const scrollBox = document.createElement('div');
  scrollBox.style.width = '350px';
  scrollBox.style.height = '350px';
  scrollBox.style.border = '2px solid #444';
  scrollBox.style.overflow = 'scroll'; // Enables both vertical and horizontal scroll
  scrollBox.style.whiteSpace = 'nowrap'; // Prevents wrapping for horizontal scroll
  scrollBox.style.padding = '10px';
  scrollBox.style.fontFamily = 'monospace';
  scrollBox.style.backgroundColor = 'black';
  scrollBox.style.color = 'white';

  const jsonContent = document.createElement('pre');
  jsonContent.textContent = jsonText;
  scrollBox.appendChild(jsonContent);
  jsonDiv.appendChild(scrollBox);

  // Open JSON data dialog button
  const jsonDataDialogButton = document.createElement('button');
  jsonDataDialogButton.textContent = 'Open in Dialog';
  jsonDataDialogButton.id = 'jsonDataDialogButton';
  jsonDataDialogButton.style.backgroundColor = 'blue';
  jsonDataDialogButton.style.color = 'white';
  jsonDataDialogButton.onclick = () => openJsonDataDialog(jsonText);
  jsonDiv.appendChild(jsonDataDialogButton);
  jsonDiv.appendChild(document.createElement('p'));

  return jsonDiv;
};

/**
 * Extract the document structure of the currently loaded PDF.
 *
 * Sends a GET request to the server to receive the extracted document
 * structure as JSON. On success the data is cleaned (trial page and watermark
 * rectangle removed), stored in `jsonData`, shown in the panel together with
 * a colour legend, and drawn onto the PDF as annotations.
 *
 * The returned promise always settles and never rejects: any failure
 * (network error, non-200 status, unparsable or invalid JSON) is logged and
 * leaves `jsonData` set to null.
 *
 * @param {object} instance - The WebViewer instance.
 * @returns {Promise<void>}
 */
const extractDocumentStructure = async (instance) => {
  const doc = instance.Core.documentViewer.getDocument();
  // Encode the filename so characters such as '&', '#' or spaces cannot break the query string.
  const url = `http://localhost:5050/server/handler.js?filename=${encodeURIComponent(doc.filename)}`;

  try {
    const response = await fetch(url, { method: 'GET' });
    if (response.status !== 200) {
      console.error(`❌ Server responded with status: ${response.status}`);
      jsonData = null;
      return;
    }

    const parsed = JSON.parse(await response.text());

    // check if the received JSON data is valid
    if (parsed == null || !Array.isArray(parsed.pages)) {
      console.error('❌ Received invalid JSON data from server');
      jsonData = null;
      return;
    }

    jsonData = removeJSONTrialPage(parsed); // remove trial mode page from JSON data
    jsonData = removeWatermarkRect(jsonData); // remove watermark rectangle from JSON data

    // Display form of the JSON: escaped line breaks and quotes are unescaped
    // for readability, so this text is not guaranteed to be valid JSON.
    let jsonText = JSON.stringify(jsonData, null, 2);
    jsonText = jsonText.replace(/\\r\\n/g, '\n');
    jsonText = jsonText.replace(/\\"/g, '"');

    documentStructurePanel.render.appendChild(createColorLegendElement());
    documentStructurePanel.render.appendChild(createJsonViewElement(jsonText));
    drawAnnotations(instance);
  } catch (error) {
    jsonData = null;
    console.error('❌ Failed to extract document structure:', error);
  }
};
405
// Switch a button between its enabled look (blue/white) and its
// disabled look (gray/darkgray), and set its `disabled` flag to match.
const enableButton = (button, state) => {
  button.disabled = !state;

  if (state) {
    button.style.backgroundColor = 'blue';
    button.style.color = 'white';
  } else {
    button.style.backgroundColor = 'gray';
    button.style.color = 'darkgray';
  }
};
412
// Start WebViewer and, each time a document finishes loading, rebuild the
// custom panel and upload the document to the extraction server.
WebViewer({
  path: '/lib',
  initialDoc: initialDoc,
  enableFilePicker: true, // Enable file picker to open files. In WebViewer -> menu icon -> Open File
  enableMeasurement: true,
  loadAsPDF: true,
  licenseKey: licenseKey,
}, viewerElement).then((instance) => {
  const { documentViewer, annotationManager } = instance.Core;

  // Once the PDF document is loaded, send it to the server.
  // The sent PDF document will be processed by the server,
  // by extracting document structure JSON data when the user clicks the "Extract Document Structure" button.
  documentViewer.addEventListener('documentLoaded', async () => {
    // Customize the main webviewer left panel after the load completion
    customizeUI(instance);

    // Remember the panel built for THIS document, so a slow upload cannot
    // enable the button of a panel created for a later document.
    const panelElement = documentStructurePanel.render;

    // Reset JSON data
    jsonData = null;

    const doc = documentViewer.getDocument();
    // Encode the filename so characters such as '&', '#' or spaces cannot break the query string.
    const uploadUrl = `http://localhost:5050/server/handler.js?filename=${encodeURIComponent(doc.filename)}`;

    // Everything that can fail is inside the try block: this is an event
    // listener, so a rejection escaping it would go unhandled.
    try {
      // Preparation of the PDF blob to be sent to the server
      const xfdfString = await annotationManager.exportAnnotations(); // obtaining annotations in the loaded document
      const data = await doc.getFileData({ xfdfString });
      const blob = new Blob([new Uint8Array(data)], { type: 'application/pdf' });
      const formData = new FormData();
      formData.append(doc.filename, blob, doc.filename);

      // Send the PDF blob to the server for processing
      console.log('🚀 Sending PDF to server for initial processing...');
      let response;
      try {
        response = await fetch(uploadUrl, { method: 'POST', body: formData });
      } catch (error) {
        console.error('❌ Failed to connect to server:', error);
        console.error('📍 Attempted URL: http://localhost:5050/server/handler.js');
        console.error('🔍 This likely means the document structure extraction server is not running on port 5050');
        throw error;
      }

      console.log(`📡 Server response status: ${response.status}`);
      if (response.status !== 200) {
        console.error(`❌ Server responded with status: ${response.status}`);
        throw new Error(`Server error: ${response.status}`);
      }
      console.log('✅ PDF successfully sent to server');

      // enable Extract Document Structure button
      const extractButton = panelElement?.querySelector('#extractDocumentStructureButton');
      if (extractButton) {
        console.log('🔓 Enabling Extract Document Structure button');
        enableButton(extractButton, true);
      } else {
        console.warn('⚠️ Could not find extractDocumentStructureButton in DOM');
      }
    } catch (error) {
      console.error('❌ Error in PDF upload promise:', error);
    }
  });

  console.log('✅ WebViewer loaded successfully.');
}).catch((error) => {
  console.error('❌ Failed to initialize WebViewer:', error);
});
483

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales