Document Structure Extraction Showcase Demo Code Sample

Requirements
View Demo

Easily analyze the first two pages of a PDF and produce JSON that describes the document's structure. Preview the JSON data alongside a color-coded legend for each extracted document element.

This sample code includes Server SDK processing in JavaScript, with UI provided by WebViewer. If a viewer is not needed, or you want to work with a different language or framework for the Server SDK, please check out our Server Smart Data Extraction Sample Code.

This demo allows you to:

  • Upload your own PDF file
  • Extract a JSON containing the elements in the PDF
  • Create a colorized legend for each extracted element
  • Update the PDF document to identify the extracted elements

Implementation steps

To add document structure extraction capability with Server SDK and a UI with WebViewer:

Step 1: Follow get-started in JavaScript for Server SDK
Step 2: Follow get-started in your preferred web stack for WebViewer
Step 3: Download Data Extraction Module
Step 4: Add the ES6 JavaScript sample code provided in this guide

In this demo, the first page in the JSON file is skipped as it contains a trial demo message. Processing starts on the next page after the trial demo page.

1// ES6 Compliant Syntax
2// Copilot name: GitHub Copilot, version: 1.0.0, model: GPT-4, version: 2024-06, date: 2025-09-15
3// File: client/index.js
4
5// **Important**
6// You must get a license key from Apryse to run the WebViewer Server SDK.
7// A trial key can be obtained from:
8// https://docs.apryse.com/core/guides/get-started/trial-key
// Apryse license key passed to WebViewer (replace the placeholder with a real or trial key).
const licenseKey = 'YOUR_LICENSE_KEY';

// Sample PDF opened in the viewer on startup.
const initialDoc = 'https://apryse.s3.us-west-1.amazonaws.com/public/files/samples/sales-invoice-with-credit-cards.pdf';

// DOM node that hosts the WebViewer instance.
const viewerElement = document.getElementById('viewer');
// Most recently extracted document structure (parsed JSON), or null when none is available.
let jsonData = null;

// Legend table: one entry per element type with its display title and annotation color.
// Rows are written as [type, title, R, G, B]; every color is fully opaque (A = 1).
const documentStructureMap = [
  ['image', 'Image', 1, 129, 1],
  ['paragraph', 'Paragraph', 254, 254, 1],
  ['list', 'List', 254, 129, 193],
  ['heading', 'Heading', 254, 166, 1],
  ['header', 'Header', 200, 20, 128],
  ['footer', 'Footer', 3, 219, 252],
  ['graphic', 'Graphic', 255, 0, 0],
  ['td', 'Table Data Cell', 51, 101, 251],
  ['th', 'Table Header Cell', 128, 0, 128],
  ['textbox', 'Text Box', 20, 230, 50],
  ['group', 'Group', 0, 0, 0],
].map(([type, title, R, G, B]) => ({ type, title, color: { R, G, B, A: 1 } }));
26
// Panels currently registered with the main WebViewer UI; filled in by customizeUI.
let viewerPanels = null;

// WebViewer's left-hand tab panel. `handle` stays null until the panel object is looked up.
const tabPanel = { handle: null, dataElement: 'tabPanel' };

// Custom "Document Structure" sub-panel.
//   handle - its entry in the tab panel's sub-panel list
//   render - the DOM element displayed inside the panel
const documentStructurePanel = { handle: null, dataElement: 'documentStructurePanel', render: null };
42
// Customize the main WebViewer left panel after a document has loaded.
//
// Registers the custom document structure sub-panel and puts it first in the
// tab panel's sub-panel list, then opens the tab panel at a width of 400.
//
// instance: the WebViewer instance (its `UI` namespace is used).
const customizeUI = (instance) => {
  const { UI } = instance;

  // Close the tab panel (if it's open) so it is refreshed when reopened below.
  UI.closeElements([tabPanel.dataElement]);

  // Look up the tab panel among the panels registered in the main WebViewer UI.
  viewerPanels = UI.getPanels();
  tabPanel.handle = viewerPanels.find((panel) => panel.dataElement === tabPanel.dataElement);

  // Register the custom document structure sub-panel (rebuilds its DOM content).
  RegisterDocumentStructurePanel(instance);

  if (!tabPanel.handle) {
    // Without the tab panel there is nothing to attach the sub-panel to.
    console.warn(`⚠️ Could not find panel '${tabPanel.dataElement}'; document structure panel was not attached`);
    return;
  }

  // This function runs on every document load, so drop any entry for our
  // sub-panel left over from a previous run before prepending a fresh one;
  // otherwise the same sub-panel would be listed once per loaded document.
  documentStructurePanel.handle = { render: documentStructurePanel.dataElement };
  const otherPanels = (tabPanel.handle.panelsList || []).filter(
    (entry) => !entry || entry.render !== documentStructurePanel.dataElement,
  );
  tabPanel.handle.panelsList = [documentStructurePanel.handle, ...otherPanels];

  UI.openElements([tabPanel.dataElement]);
  UI.setPanelWidth(tabPanel.dataElement, 400);
};
66
67// Register the custom document structure sub-panel.
68const RegisterDocumentStructurePanel = (instance) => {
69 documentStructurePanel.render = CreateDocumentStructurePanelElements(instance);
70 instance.UI.addPanel({
71 dataElement: documentStructurePanel.dataElement,
72 location: 'left',
73 icon: '<svg width="18px" height="18px" viewBox="0 0 24 24" id="圖層_1" data-name="圖層 1" xmlns="http://www.w3.org/2000/svg"><defs><style>.cls-1{fill:#080808;}</style></defs><title>form</title><path class="cls-1" d="M21,.5H3a2,2,0,0,0-2,2V22a2,2,0,0,0,2,2H21a2,2,0,0,0,2-2V2.5A2,2,0,0,0,21,.5Zm0,2v2H3v-2ZM3,22V6.5H21V22Z"/><path class="cls-1" d="M12.5,4H20a.5.5,0,0,0,0-1H12.5a.5.5,0,0,0,0,1Z"/><path class="cls-1" d="M4.5,4a.43.43,0,0,0,.19,0,.35.35,0,0,0,.16-.11A.47.47,0,0,0,5,3.5a.43.43,0,0,0,0-.19.36.36,0,0,0-.11-.16.5.5,0,0,0-.7,0A.35.35,0,0,0,4,3.31.43.43,0,0,0,4,3.5a.51.51,0,0,0,.5.5Z"/><path class="cls-1" d="M5.65,3.85A.36.36,0,0,0,5.81,4,.44.44,0,0,0,6,4a.47.47,0,0,0,.35-.15.36.36,0,0,0,.11-.16.6.6,0,0,0,0-.19.51.51,0,0,0-.15-.35A.49.49,0,0,0,5.81,3a.36.36,0,0,0-.16.11.47.47,0,0,0-.15.35.4.4,0,0,0,0,.19A.35.35,0,0,0,5.65,3.85Z"/><path class="cls-1" d="M8,8H4.5a1,1,0,0,0,0,2H8A1,1,0,0,0,8,8Z"/><path class="cls-1" d="M8,11.67H4.5a1,1,0,0,0,0,2H8a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M8,15.33H4.5a1,1,0,0,0,0,2H8a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M8,19H4.5a1,1,0,0,0,0,2H8a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M14,8H10.5a1,1,0,0,0,0,2H14a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M14,11.67H10.5a1,1,0,0,0,0,2H14a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M14,15.33H10.5a1,1,0,0,0,0,2H14a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M14,19H10.5a1,1,0,0,0,0,2H14a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M19.5,8h-3a1,1,0,0,0,0,2h3a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M19.5,11.67h-3a1,1,0,0,0,0,2h3a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M19.5,15.33h-3a1,1,0,0,0,0,2h3a1,1,0,0,0,0-2Z"/><path class="cls-1" d="M19.5,19h-3a1,1,0,0,0,0,2h3a1,1,0,0,0,0-2Z"/></svg>',
74 title: 'Document Structure',
75 render: () => documentStructurePanel.render,
76 });
77};
78
// Create the document structure panel elements.
//
// Builds a <div id="documentStructure"> containing an introductory text, a
// highlighted note, a divider and the "Extract Document Structure" button.
// The button starts disabled; it is enabled elsewhere once the PDF has been
// sent to the server.
//
// instance: the WebViewer instance, forwarded to extractDocumentStructure on click.
// Returns the panel's root <div>.
const CreateDocumentStructurePanelElements = (instance) => {
  const panelDiv = document.createElement('div');
  panelDiv.id = 'documentStructure';
  panelDiv.appendChild(document.createTextNode('A sample PDF will have the first two pages analyzed and the resulting JSON can be viewed. After extraction, the displayed PDF will have annotations showing the identified elements.'));

  // Empty paragraph used as a spacer, followed by the highlighted note.
  panelDiv.appendChild(document.createElement('p'));
  const note = document.createElement("span");
  note.style.color = 'orange';
  note.appendChild(document.createTextNode('NOTE: Only the first two pages will be processed.'));
  panelDiv.appendChild(note);

  const dividerDiv = document.createElement('div');
  dividerDiv.style.borderTop = '1px solid #ccc';
  dividerDiv.style.margin = '10px 0';
  panelDiv.appendChild(dividerDiv);

  // Extract document structure button.
  const extractDocumentStructureButton = document.createElement('button');
  extractDocumentStructureButton.textContent = 'Extract Document Structure';
  extractDocumentStructureButton.id = 'extractDocumentStructureButton';

  // Apply the same cursor to the button and to the whole panel.
  const setCursor = (cursor) => {
    extractDocumentStructureButton.style.cursor = cursor;
    documentStructurePanel.render.style.cursor = cursor;
  };

  extractDocumentStructureButton.onclick = async () => {
    setCursor("not-allowed");
    enableButton(extractDocumentStructureButton, false);
    try {
      await extractDocumentStructure(instance); // Extract document structure
    } catch (error) {
      // Nothing awaits this handler, so report the failure here instead of
      // leaving an unhandled promise rejection.
      console.error('❌ Document structure extraction failed:', error);
    } finally {
      // Always restore the cursor, even when extraction fails.
      setCursor("default");
    }
  };
  enableButton(extractDocumentStructureButton, false); // Initially disabled. Enabled after the PDF is sent to the server.

  panelDiv.appendChild(extractDocumentStructureButton);
  panelDiv.appendChild(document.createElement('p'));

  return panelDiv;
};
119
// Open JSON data in a modal viewer with zoom In/Out and Close buttons.
//
// jsonText: the text to display. It is shown verbatim as plain text.
//
// The dialog is appended to document.body and is dismissed either with the
// Close button or by clicking the overlay outside the modal box.
const openJsonDataDialog = (jsonText) => {
  const MIN_FONT_SIZE = 10;
  const FONT_STEP = 2;
  let fontSize = 14;

  // Create overlay.
  const overlay = document.createElement("div");
  overlay.className = "modal-overlay";

  // remove() is a no-op when the overlay is already detached, so closing twice is safe.
  const closeDialog = () => overlay.remove();

  overlay.onclick = (e) => {
    // Only clicks on the backdrop itself close the dialog, not clicks inside the modal.
    if (e.target === overlay) {
      closeDialog();
    }
  };

  // Modal box
  const modal = document.createElement("div");
  modal.className = "modal-box";

  // Content. Created before the controls because the zoom handlers refer to it.
  const content = document.createElement("pre");
  content.className = "modal-content";
  // The text originates from content extracted out of a user-supplied PDF, so
  // it must never be parsed as HTML: assign it as plain text.
  content.textContent = jsonText;

  const applyFontSize = () => {
    content.style.fontSize = `${fontSize}px`;
  };
  applyFontSize();

  // Controls
  const controls = document.createElement("div");
  controls.className = "modal-controls";

  const zoomInBtn = document.createElement("button");
  zoomInBtn.textContent = "+";
  zoomInBtn.onclick = () => {
    fontSize += FONT_STEP;
    applyFontSize();
  };

  const zoomOutBtn = document.createElement("button");
  zoomOutBtn.textContent = "-";
  zoomOutBtn.onclick = () => {
    // Never shrink below the minimum readable size.
    fontSize = Math.max(MIN_FONT_SIZE, fontSize - FONT_STEP);
    applyFontSize();
  };

  const closeBtn = document.createElement("button");
  closeBtn.textContent = "Close";
  closeBtn.className = "modal-close";
  closeBtn.onclick = closeDialog;

  controls.appendChild(zoomInBtn);
  controls.appendChild(zoomOutBtn);
  controls.appendChild(closeBtn);

  modal.appendChild(controls);
  modal.appendChild(content);
  overlay.appendChild(modal);
  document.body.appendChild(overlay);
};
177
// Draw a rectangle annotation for the given item on the specified page.
//
// instance:   the WebViewer instance (uses Core.annotationManager and Core.Annotations).
// pageNumber: page on which the rectangle is placed.
// item:       extracted element with a `type` (looked up in documentStructureMap for
//             the stroke color) and a `rect` array; rect[0]/rect[1] are used as X/Y
//             and rect[2]/rect[3] as the opposite corner.
//
// Items whose type has no legend entry, or that carry no usable rect, are
// skipped with a warning instead of aborting the whole drawing pass.
const drawAnnotationRectangle = (instance, pageNumber, item) => {
  const { annotationManager, Annotations } = instance.Core;

  if (!item) {
    return;
  }

  const legendEntry = documentStructureMap.find((field) => field.type === item.type);
  if (!legendEntry) {
    console.warn(`⚠️ No color defined for element type '${item.type}'; skipping annotation`);
    return;
  }

  if (!Array.isArray(item.rect) || item.rect.length < 4) {
    console.warn(`⚠️ Element of type '${item.type}' has no bounding rectangle; skipping annotation`);
    return;
  }

  const [x1, y1, x2, y2] = item.rect;
  const { R, G, B, A } = legendEntry.color;
  const annot = new Annotations.RectangleAnnotation({
    PageNumber: pageNumber,
    X: x1,
    Y: y1,
    Width: x2 - x1,
    Height: y2 - y1,
    StrokeColor: new Annotations.Color(R, G, B, A),
    StrokeThickness: 2,
  });

  annotationManager.addAnnotation(annot);
  annotationManager.redrawAnnotation(annot);
};
197
// Draw one rectangle per extracted element, for every page in the current jsonData.
//   - 'table' elements: one rectangle per cell (each td of each tr).
//   - 'graphic' elements: one rectangle per entry of `contents` when present,
//     otherwise one for the graphic itself.
//   - anything else: one rectangle for the element.
const drawAnnotations = (instance) => {
  for (const page of jsonData.pages) {
    // The page number is read at draw time, exactly when a rectangle is emitted.
    const mark = (item) => drawAnnotationRectangle(instance, page.properties.pageNumber, item);

    for (const element of page.elements) {
      if (element.type === 'table') {
        for (const row of element.trs) {
          for (const cell of row.tds) {
            mark(cell);
          }
        }
      } else if (element.type === 'graphic' && element.contents) {
        for (const part of element.contents) {
          mark(part);
        }
      } else {
        mark(element);
      }
    }
  }
};
226
// Remove trial mode page from JSON data.
// If a demo license key is provided instead of a production one,
// the received JSON data contains a page with a message indicating
// that the Apryse SDK is running in trial mode.
//
// json: parsed extraction result with a `pages` array. Each page may carry
//       `elements`, whose `contents` entries may carry a `text` string.
//
// The first page containing the trial message is removed in place and the
// remaining pages are renumbered 1..n. When no such page exists the data is
// left untouched. Returns the same `json` object.
const removeJSONTrialPage = (json) => {
  const TRIAL_MESSAGE = 'Apryse Data Extraction Module trial mode.';

  const mentionsTrial = (content) =>
    Boolean(content) && typeof content.text === 'string' && content.text.includes(TRIAL_MESSAGE);

  const isTrialPage = (page) =>
    Array.isArray(page.elements) &&
    page.elements.some(
      (element) => Boolean(element) && Array.isArray(element.contents) && element.contents.some(mentionsTrial),
    );

  // Use the page's position in the array (not its pageNumber property) as the
  // index to remove, so the right entry is spliced even when the numbering
  // does not line up with the array order.
  const pageIndexToRemove = json.pages.findIndex(isTrialPage);

  // Remove the page and update page numbers.
  if (pageIndexToRemove !== -1) {
    json.pages.splice(pageIndexToRemove, 1);
    json.pages.forEach((page, index) => {
      page.properties.pageNumber = index + 1;
    });
  }

  return json;
};
263
// Build the "Color Legend" block: a title followed by one bulleted row per
// entry of documentStructureMap, colored with that entry's annotation color.
const createColorLegend = (instance) => {
  const colorsDiv = document.createElement('div');
  // NOTE(review): the JSON data block is given the same id ('json'); ids should
  // be unique, but both are kept unchanged in case external styles rely on them.
  colorsDiv.id = 'json';
  colorsDiv.className = "listContainer";

  const colorsTitle = document.createElement("h3");
  colorsTitle.textContent = "Color Legend";
  colorsDiv.appendChild(colorsTitle);
  colorsDiv.appendChild(document.createElement('p'));

  documentStructureMap.forEach((field) => {
    const { R, G, B, A } = field.color;
    // The Color object is converted to a string when assigned to a CSS property.
    const color = new instance.Core.Annotations.Color(R, G, B, A);

    const listItem = document.createElement("div");
    listItem.className = "listItem";
    [
      ["--bullet-color", color],
      ["color", color],
      ["font-weight", "bold"],
      ["position", "relative"],
      ["padding-left", "20px"],
      ["margin", "8px 0"],
      ["list-style", "none"],
      ["display", "block"],
      ["line-height", "1.5"],
      ["font-size", "14px"],
    ].forEach(([property, value]) => listItem.style.setProperty(property, value));

    // Round bullet drawn manually, placed before the title text.
    const bullet = document.createElement("span");
    bullet.style.width = "10px";
    bullet.style.height = "10px";
    bullet.style.borderRadius = "50%";
    bullet.style.backgroundColor = color;
    bullet.style.display = "inline-block";
    bullet.style.marginRight = "10px";
    bullet.style.verticalAlign = "middle";

    listItem.appendChild(bullet);
    listItem.appendChild(document.createTextNode(field.title));
    colorsDiv.appendChild(listItem);
  });

  return colorsDiv;
};

// Build the "JSON Data" block: a title, a scrollable preview of jsonText and
// an "Open in Dialog" button that shows the same text in the modal viewer.
const createJsonPreview = (jsonText) => {
  const jsonDiv = document.createElement('div');
  jsonDiv.id = 'json';

  const jsonTitle = document.createElement("h3");
  jsonTitle.textContent = "JSON Data";
  jsonDiv.appendChild(jsonTitle);
  jsonDiv.appendChild(document.createElement('p'));

  const scrollBox = document.createElement("div");
  scrollBox.style.width = "350px";
  scrollBox.style.height = "350px";
  scrollBox.style.border = "2px solid #444";
  scrollBox.style.overflow = "scroll"; // Enables both vertical and horizontal scroll.
  scrollBox.style.whiteSpace = "nowrap"; // Prevents wrapping for horizontal scroll.
  scrollBox.style.padding = "10px";
  scrollBox.style.fontFamily = "monospace";
  scrollBox.style.backgroundColor = "black";
  scrollBox.style.color = "white";

  const jsonContent = document.createElement("pre");
  jsonContent.textContent = jsonText;
  scrollBox.appendChild(jsonContent);
  jsonDiv.appendChild(scrollBox);

  const jsonDataDialogButton = document.createElement('button');
  jsonDataDialogButton.textContent = 'Open in Dialog';
  jsonDataDialogButton.id = 'jsonDataDialogButton';
  jsonDataDialogButton.style.backgroundColor = 'blue';
  jsonDataDialogButton.style.color = 'white';
  jsonDataDialogButton.onclick = () => openJsonDataDialog(jsonText);
  jsonDiv.appendChild(jsonDataDialogButton);
  jsonDiv.appendChild(document.createElement('p'));

  return jsonDiv;
};

// Extract document structure from the PDF document.
//
// Sends a GET request to the server for the currently loaded document and
// expects the extracted document structure back as JSON text. On success the
// result is stored in jsonData (with the trial-mode page removed), a color
// legend and a JSON preview are appended to the document structure panel, and
// the identified elements are drawn as annotations on the PDF.
//
// instance: the WebViewer instance.
// Returns a promise that always resolves (with no value) once the attempt has
// finished; failures are logged to the console rather than thrown, so callers
// awaiting it are never left hanging.
const extractDocumentStructure = async (instance) => {
  try {
    const doc = instance.Core.documentViewer.getDocument();

    // NOTE(review): the filename is interpolated as-is, matching the upload
    // request; names containing '&', '#' or '+' may need encoding on both sides.
    const response = await fetch(`http://localhost:5050/server/handler.js?filename=${doc.filename}`, {
      method: 'GET'
    });

    if (response.status !== 200) {
      console.error(`❌ Server responded with status: ${response.status}`);
      jsonData = null;
      return;
    }

    const parsed = JSON.parse(await response.text());

    // Check that the received JSON data has the expected shape.
    if (parsed === null || typeof parsed !== 'object' || !Array.isArray(parsed.pages)) {
      console.error('❌ Received invalid JSON data from server');
      jsonData = null;
      return;
    }

    jsonData = removeJSONTrialPage(parsed); // Remove trial mode page from JSON data.

    // Pretty-print, then turn escaped line breaks and escaped quotes into
    // their literal characters for display.
    let jsonText = JSON.stringify(jsonData, null, 2);
    jsonText = jsonText.replace(/\\r\\n/g, '\n');
    jsonText = jsonText.replace(/\\"/g, '"');

    documentStructurePanel.render.appendChild(createColorLegend(instance));
    documentStructurePanel.render.appendChild(createJsonPreview(jsonText));
    drawAnnotations(instance);
  } catch (error) {
    // Covers network failures, malformed JSON and rendering errors.
    console.error('❌ Failed to extract document structure:', error);
  }
};
390
// Switch a button between its enabled look (blue/white) and disabled look (gray/darkgray).
// A truthy `state` enables the button; a falsy one disables it.
const enableButton = (button, state) => {
  const [background, foreground] = state ? ['blue', 'white'] : ['gray', 'darkgray'];

  button.disabled = !state;
  button.style.backgroundColor = background;
  button.style.color = foreground;
};
397
// Initialize WebViewer and, each time a document finishes loading, upload that
// document to the extraction server so it is ready to be processed when the
// user clicks the "Extract Document Structure" button.
WebViewer({
  path: '/lib',
  initialDoc: initialDoc,
  enableFilePicker: true, // Enable file picker to open files. In WebViewer -> menu icon -> Open File.
  enableMeasurement: true,
  loadAsPDF: true,
  licenseKey: licenseKey,
}, viewerElement).then((instance) => {
  const { documentViewer, annotationManager } = instance.Core;

  documentViewer.addEventListener('documentLoaded', async () => {

    // Customize the main webviewer left panel after the load completion.
    customizeUI(instance);

    // Reset JSON data.
    jsonData = null;

    // Nothing awaits this listener, so every failure below is caught and
    // logged here rather than surfacing as an unhandled promise rejection.
    try {
      // Preparation of the PDF blob to be sent to the server.
      const doc = documentViewer.getDocument();
      const xfdfString = await annotationManager.exportAnnotations(); // obtaining annotations in the loaded document
      const data = await doc.getFileData({ xfdfString });
      const blob = new Blob([new Uint8Array(data)], { type: 'application/pdf' });
      const formData = new FormData();
      formData.append(doc.filename, blob, doc.filename);

      // Send the PDF blob to the server for processing.
      console.log('🚀 Sending PDF to server for initial processing...');

      let response;
      try {
        response = await fetch(`http://localhost:5050/server/handler.js?filename=${doc.filename}`, {
          method: 'POST',
          body: formData,
        });
      } catch (error) {
        // Only a rejected fetch means the server could not be reached.
        console.error('❌ Failed to connect to server:', error);
        console.error('📍 Attempted URL: http://localhost:5050/server/handler.js');
        console.error('🔍 This likely means the document structure extraction server is not running on port 5050');
        throw error;
      }

      console.log(`📡 Server response status: ${response.status}`);

      if (response.status !== 200) {
        console.error(`❌ Server responded with status: ${response.status}`);
        throw new Error(`Server error: ${response.status}`);
      }

      console.log('✅ PDF successfully sent to server');

      // Enable Extract Document Structure button.
      const extractButton = documentStructurePanel.render.querySelector('#extractDocumentStructureButton');
      if (extractButton) {
        console.log('🔓 Enabling Extract Document Structure button');
        enableButton(extractButton, true);
      } else {
        console.warn('⚠️ Could not find extractDocumentStructureButton in DOM');
      }
    } catch (error) {
      console.error('❌ Error in PDF upload promise:', error);
    }
  });

  console.log('✅ WebViewer loaded successfully.');
}).catch((error) => {
  console.error('❌ Failed to initialize WebViewer:', error);
});
468

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales