PDF Text Extractor Showcase Demo Code Sample

Requirements
View Demo

Quickly extract text from PDFs—either from entire pages or highlighted sections only.

This demo allows you to:

  • Upload your own PDF file.
  • Highlight text to perform extraction.
  • Preview extracted text: Full page and highlighted text.

Implementation steps
To add PDF Text Extraction capability with WebViewer:

Step 1: Choose your preferred web stack for WebViewer.
Step 2: Add the ES6 JavaScript sample code provided in this guide.

Once you generate your license key, it will automatically be included in your sample code below.

License Key

1// ES6 Compliant Syntax
2// GitHub Copilot, Claude Sonnet 4 (Preview), October 14, 2025
3// File: showcase-demos/pdf-text-extractor/index.js
4import WebViewer from '@pdftron/webviewer';
5
const licenseKey = 'YOUR_WEBVIEWER_LICENSE_KEY';

// URL of the sample PDF opened when the viewer starts.
// NOTE(review): the "redaction" prefix does not match this text-extraction demo;
// the name is kept because initializeWebViewer() refers to it.
const redactionDemoFile = 'https://apryse.s3.amazonaws.com/public/files/samples/section-508.pdf';

// Mutable module state, reassigned by the functions in this file.
let pageCount = 0; // page count reported by the loaded document
let textContent = ''; // text of the most recently extracted page
let annotTextContent = ''; // text found under annotations, one entry per line
let currentPage = 0; // page last accepted by setPage()
14
// Creates the WebViewer instance inside the `#viewer` element and registers the
// listeners that drive text extraction:
//   - 'documentLoaded'    -> configure the UI, record the page count, go to page 1
//   - 'pageNumberUpdated' -> viewerUpdated
//   - 'annotationChanged' -> getAnnotListener
// Logs an error and returns without creating a viewer when `#viewer` is missing.
// A failure while creating the viewer (or inside the setup callback) is logged
// instead of being left as an unhandled promise rejection.
function initializeWebViewer() {
  const element = document.getElementById('viewer');
  if (!element) {
    console.error('Viewer div not found.');
    return;
  }

  WebViewer({
    path: '/lib',
    initialDoc: redactionDemoFile,
    licenseKey: licenseKey,
    fullAPI: true,
    enableFilePicker: true, // Enable file picker to open files. In WebViewer -> menu icon -> Open File
  }, element)
    .then((instance) => {
      const { UI } = instance;
      const { documentViewer } = instance.Core;

      documentViewer.addEventListener('documentLoaded', () => {
        UI.setLayoutMode(UI.LayoutMode.Single); // Single page view
        UI.disableFadePageNavigationComponent(); // Keep the page navigation component on screen
        pageCount = documentViewer.getDocument().getPageCount();
        setPage(1); // Selecting the first page triggers text extraction
      });

      // Re-run extraction whenever the visible page changes.
      documentViewer.addEventListener('pageNumberUpdated', viewerUpdated);

      // Re-run extraction whenever annotations are modified.
      documentViewer
        .getAnnotationManager()
        .addEventListener('annotationChanged', getAnnotListener);

      // UI section
      createUIElements();
      // Trigger an immediate UI update if the hook is already available.
      if (window.updateUIContent) {
        window.updateUIContent();
      }
    })
    .catch((error) => {
      console.error('Failed to initialize WebViewer.', error);
    });
}
54
// Extracts the page text (getText) and then the text under annotations
// (getAnnotText) for one page of the currently loaded document.
//
// pageNumber: 1-based page number; values below 1 or above the document's page
//             count are rejected with a warning.
// Returns a promise that resolves when extraction finishes. Every caller in this
// file invokes the function without awaiting it, so failures are logged here
// rather than surfacing as unhandled promise rejections.
async function getAllTextFromDocument(pageNumber) {
  try {
    const doc = window.WebViewer.getInstance().Core.documentViewer.getDocument();
    const isValidPage = Boolean(doc) && pageNumber > 0 && pageNumber <= doc.getPageCount();
    if (!isValidPage) {
      console.warn('Document not available or invalid page number');
      return;
    }
    await getText(pageNumber);
    await getAnnotText(pageNumber);
  } catch (error) {
    console.error('Text extraction failed for page', pageNumber, error);
  }
}
65
// Loads the text of one page and stores it in the module-level `textContent`;
// also refreshes `pageCount` from the document.
//
// pageNumber: page whose text is requested, passed straight to loadPageText.
// Returns a promise that resolves once the state has been updated. Does nothing
// when no document is loaded.
async function getText(pageNumber) {
  const { documentViewer } = window.WebViewer.getInstance().Core;
  const doc = documentViewer.getDocument();
  // Check that a document is loaded before proceeding.
  if (!doc) {
    return;
  }

  const newPageCount = doc.getPageCount();
  // loadPageText is awaited for its result instead of using the completion
  // callback; the state is only touched after the text has arrived.
  const newText = await doc.loadPageText(pageNumber);
  textContent = newText;
  pageCount = newPageCount;
}
82
// Extracts the text lying under each annotation on a page and stores the result,
// joined with newlines, in the module-level `annotTextContent`.
//
// Steps: export the viewer's annotations for the page as XFDF, merge them into
// the PDFNet document, then ask a TextExtractor for the text under every
// annotation found on that page.
//
// pageNumber: page to inspect; compared with `===` against each annotation's
//             page number, so it must be a number.
// Returns a promise that resolves when extraction and the UI hooks have run.
// Returns early (after a warning) when no PDF document is available.
async function getAnnotText(pageNumber) {
  const { PDFNet, documentViewer } = window.WebViewer.getInstance().Core;
  await PDFNet.initialize();
  await documentViewer.getAnnotationsLoadedPromise(); // Ensure annotations are loaded

  // Bail out before doing any annotation work when there is no document.
  const doc = await getPDFDocument(documentViewer, PDFNet);
  if (!doc) {
    console.warn('PDF document not available');
    return;
  }

  const annotationManager = documentViewer.getAnnotationManager();
  const annotList = annotationManager
    .getAnnotationsList()
    .filter((a) => a.getPageNumber() === pageNumber);
  const xfdfString = await annotationManager.exportAnnotations({ annotationList: annotList });
  const textOutput = [];

  // Run PDFNet methods with memory management.
  await PDFNet.runWithCleanup(async () => {
    try {
      // Wait for the lock before the write operation below. The original author
      // notes that runWithCleanup releases the lock when it completes.
      await doc.lock();
      const fdfDoc = await PDFNet.FDFDoc.createFromXFDF(xfdfString);
      await doc.fdfUpdate(fdfDoc);
      const page = await doc.getPage(pageNumber);
      const cropBox = await page.getCropBox();
      const extractor = await PDFNet.TextExtractor.create();
      // Awaited so that a failure here is caught by this try block.
      await extractor.begin(page, cropBox);
      const annotCount = await page.getNumAnnots();
      for (let i = 0; i < annotCount; ++i) {
        const annot = await page.getAnnot(i);
        textOutput.push(await extractor.getTextUnderAnnot(annot));
      }
    } catch (e) {
      console.log('Document no longer exists, demo probably unmounted', e);
    }
    // Publish whatever was collected, even after a partial failure.
    annotTextContent = textOutput.join('\n');
    updateGlobalVars(); // Update global variables after annotation extraction completes
    // Trigger an immediate UI update if the hook is available.
    if (window.updateUIContent) {
      window.updateUIContent();
    }
  });

  // Trigger the UI update once annotation extraction is complete.
  if (window.updateUIElements) {
    window.updateUIElements(pageNumber);
  }
}
134
// Resolves the PDFNet document behind the viewer's current document.
//
// documentViewer: viewer whose current document is inspected.
// PDFNet:         PDFNet namespace, used to build a PDFDoc from a buffer.
// Returns a promise for the document, or for `undefined` when the viewer has no
// document. Documents whose `type` is 'office' are converted to a PDF buffer
// first; every other type is resolved through getPDFDoc().
async function getPDFDocument(documentViewer, PDFNet) {
  const activeDocument = documentViewer.getDocument();
  if (!activeDocument) {
    return undefined;
  }

  if (activeDocument.type !== 'office') {
    return await activeDocument.getPDFDoc();
  }

  const core = window.WebViewer.getInstance().Core;
  const fileData = await activeDocument.getFileData();
  // The conversion needs the file extension: the text after the last dot.
  const nameParts = activeDocument.filename.split('.');
  const extension = nameParts[nameParts.length - 1];
  const pdfBuffer = await core.officeToPDFBuffer(fileData, { extension });
  return await PDFNet.PDFDoc.createFromBuffer(pdfBuffer);
}
155
// Moves the viewer to a page, records it in `currentPage`, and starts text
// extraction for it.
//
// pageNumber: 1-based page number; numeric strings are accepted. Anything that
//             is not a whole number between 1 and `pageCount` is ignored.
function setPage(pageNumber) {
  // Normalise once so the viewer, `currentPage`, and the extraction code all
  // receive the same numeric value (getAnnotText compares pages with `===`).
  const page = Number(pageNumber);
  // Pages are 1-based: 0 is as invalid as a page past the end of the document.
  if (!Number.isInteger(page) || page < 1 || page > pageCount) return;

  window.WebViewer.getInstance().Core.documentViewer.setCurrentPage(page);
  currentPage = page;
  // Trigger text extraction for the new page.
  getAllTextFromDocument(page);
}
165
// 'annotationChanged' handler: re-runs text extraction for the page the viewer
// is currently showing.
function getAnnotListener() {
  const { documentViewer } = window.WebViewer.getInstance().Core;
  const visiblePage = documentViewer.getCurrentPage();
  getAllTextFromDocument(visiblePage);
}
170
// 'pageNumberUpdated' handler: forwards the viewer's current page to setPage().
function viewerUpdated() {
  const { documentViewer } = window.WebViewer.getInstance().Core;
  const visiblePage = documentViewer.getCurrentPage();
  setPage(visiblePage);
}
175
// UI Elements
// Loads the demo's ui-elements.js script on demand and, once it has loaded,
// initialises the UI against the 'viewer' element via UIElements.init().
// Does nothing when `window.SidePanel` is already defined.
// A failed script load is reported on the console instead of passing silently.
function createUIElements() {
  // NOTE(review): the guard tests `window.SidePanel` while the load callback
  // uses `UIElements` — confirm against ui-elements.js that SidePanel is the
  // right "already loaded" marker.
  if (window.SidePanel) {
    return;
  }

  const script = document.createElement('script');
  script.src = '/showcase-demos/pdf-text-extractor/ui-elements.js';
  script.onload = () => {
    UIElements.init('viewer');
  };
  script.onerror = () => {
    console.error(`Failed to load UI script: ${script.src}`);
  };
  document.head.appendChild(script);
}
191
// Copies the module-level extraction state onto `window` under the same names
// (currentPage, pageCount, textContent, annotTextContent).
function updateGlobalVars() {
  Object.assign(window, {
    currentPage,
    pageCount,
    textContent,
    annotTextContent,
  });
}
199
// Bootstrap: start the demo as soon as this module is evaluated.
initializeWebViewer();
202

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales