PDF Text Extractor Showcase Demo Code Sample

Requirements
View Demo

Quickly extract text from PDFs—either from entire pages or just the highlighted sections.

This demo allows you to:

  • Upload your own PDF file
  • Highlight text to perform extraction
  • Preview extracted text: Full page and highlighted text

Implementation steps
To add PDF Text Extraction capability with WebViewer:

Step 1: Choose your preferred web stack for WebViewer
Step 2: Add the ES6 JavaScript sample code provided in this guide

1// ES6 Compliant Syntax
2// GitHub Copilot, Claude Sonnet 4 (Preview), October 14, 2025
3// File: showcase-demos/pdf-text-extractor/index.js
4import WebViewer from '@pdftron/webviewer';
5
// Module-level state shared by the functions in this file.
// URL of the sample PDF opened when the viewer starts; never reassigned.
const redactionDemoFile = 'https://apryse.s3.amazonaws.com/public/files/samples/section-508.pdf';
// Page count of the loaded document (set on documentLoaded and after page text loads).
let pageCount = 0;
// Text of the page most recently loaded by getText().
let textContent = '';
// Text found under annotations on the most recently processed page, joined with newlines.
let annotTextContent = '';
// Page number last accepted by setPage(); 0 until a page has been selected.
let currentPage = 0;
12
// Mounts WebViewer into the element with id 'viewer' and registers the
// listeners that drive text extraction:
//   - documentLoaded:    switch to single-page layout, refresh pageCount, select page 1
//   - pageNumberUpdated: viewerUpdated
//   - annotationChanged: getAnnotListener
// Logs an error and returns without creating a viewer when the element is missing.
// A failure while initializing WebViewer is logged instead of surfacing as an
// unhandled promise rejection.
function initializeWebViewer() {
  const element = document.getElementById('viewer');
  if (!element) {
    console.error('Viewer div not found.');
    return;
  }

  WebViewer({
    path: '/lib',
    initialDoc: redactionDemoFile,
    licenseKey: 'YOUR_LICENSE_KEY',
    fullAPI: true,
    enableFilePicker: true, // Enable file picker to open files. In WebViewer -> menu icon -> Open File
  }, element)
    .then((instance) => {
      const { UI } = instance;
      const { documentViewer } = instance.Core;

      documentViewer.addEventListener('documentLoaded', () => {
        UI.setLayoutMode(UI.LayoutMode.Single); // Single page view
        UI.disableFadePageNavigationComponent(); // Keep the page navigation component on screen
        pageCount = documentViewer.getDocument().getPageCount();
        setPage(1); // Selecting the first page triggers text extraction
      });

      // Re-run extraction whenever the visible page changes
      documentViewer.addEventListener('pageNumberUpdated', viewerUpdated);

      // Re-run extraction whenever annotations are added, modified or deleted
      documentViewer
        .getAnnotationManager()
        .addEventListener('annotationChanged', getAnnotListener);

      createUIElements();

      // Trigger an immediate UI update if the UI script has registered one
      if (window.updateUIContent) {
        window.updateUIContent();
      }
    })
    .catch((err) => {
      console.error('Failed to initialize WebViewer.', err);
    });
}
52
// Runs both extraction passes for one page: the full page text first, then
// the text under that page's annotations. Warns and does nothing when no
// document is loaded or the page number is not greater than zero.
async function getAllTextFromDocument(pageNumber) {
  const { documentViewer } = window.WebViewer.getInstance().Core;
  const loadedDocument = documentViewer.getDocument();
  const canExtract = pageNumber > 0 && loadedDocument;

  if (!canExtract) {
    console.warn('Document not available or invalid page number');
    return;
  }

  await getText(pageNumber);
  await getAnnotText(pageNumber);
}
63
// Loads the text of one page into the module-level `textContent` and
// refreshes `pageCount` from the loaded document.
// Returns without touching any state when no document is loaded.
async function getText(pageNumber) {
  const { documentViewer } = window.WebViewer.getInstance().Core;
  const doc = documentViewer.getDocument();
  // Check if document is loaded before proceeding
  if (!doc) {
    return;
  }

  const newPageCount = doc.getPageCount();
  await doc.loadPageText(pageNumber, (newText) => {
    // State is published only once the page text has actually arrived.
    textContent = newText;
    pageCount = newPageCount;
  });
}
80
// Collects the text lying under each annotation on a page and stores it,
// newline-separated, in the module-level `annotTextContent`.
//
// The viewer's annotations for the page are exported as XFDF and merged into
// the PDFDoc with fdfUpdate before extraction runs. Afterwards the global
// window variables are refreshed and the optional UI hooks
// (window.updateUIContent / window.updateUIElements) are invoked.
// Returns early, with a warning, when no PDFDoc can be obtained.
async function getAnnotText(pageNumber) {
  const { PDFNet, documentViewer } = window.WebViewer.getInstance().Core;
  await PDFNet.initialize();
  await documentViewer.getAnnotationsLoadedPromise(); // Ensure annotations are loaded

  // Bail out before exporting annotations; the export is useless without a document.
  const doc = await getPDFDocument(documentViewer, PDFNet);
  if (!doc) {
    console.warn('PDF document not available');
    return;
  }

  const annotationManager = documentViewer.getAnnotationManager();
  const annotList = annotationManager
    .getAnnotationsList()
    .filter((a) => a.getPageNumber() === pageNumber);
  const xfdfString = await annotationManager.exportAnnotations({ annotationList: annotList });
  const textOutput = [];

  // Run PDFNet methods with memory management
  await PDFNet.runWithCleanup(async () => {
    try {
      // Lock the document before a write operation and wait for the lock,
      // so that fdfUpdate cannot start first and a failed lock lands in the
      // catch below. runWithCleanup will auto unlock when complete.
      await doc.lock();
      const fdfDoc = await PDFNet.FDFDoc.createFromXFDF(xfdfString);
      await doc.fdfUpdate(fdfDoc);
      const pageTemp = await doc.getPage(pageNumber);
      const rect = await pageTemp.getCropBox();
      const te = await PDFNet.TextExtractor.create();
      // The extractor must have finished processing the page before it is queried.
      await te.begin(pageTemp, rect);
      const annotCount = await pageTemp.getNumAnnots();
      for (let i = 0; i < annotCount; ++i) {
        const annot = await pageTemp.getAnnot(i);
        const annotText = await te.getTextUnderAnnot(annot);
        textOutput.push(annotText);
      }
    } catch (e) {
      console.log('Document no longer exists, demo probably unmounted', e);
    }
    // Publish whatever was collected, even if extraction stopped part-way.
    annotTextContent = textOutput.join('\n');
    updateGlobalVars(); // Update global variables after annotation extraction completes
    // Trigger immediate UI update if available
    if (window.updateUIContent) {
      window.updateUIContent();
    }
  });
  // Trigger UI update after annotation extraction is complete
  if (window.updateUIElements) {
    window.updateUIElements(pageNumber);
  }
}
132
// Resolves the PDFDoc behind the viewer's current document.
// - No document loaded: resolves to undefined.
// - Document whose type is 'office': the file data is converted to a PDF
//   buffer (using the filename's last dot-separated segment as the extension)
//   and a new PDFDoc is created from that buffer.
// - Anything else: the document's own PDFDoc is returned.
async function getPDFDocument(documentViewer, PDFNet) {
  const activeDocument = documentViewer.getDocument();
  if (!activeDocument) {
    return undefined;
  }

  if (activeDocument.type !== 'office') {
    const nativeDoc = await activeDocument.getPDFDoc();
    return nativeDoc;
  }

  const { Core } = window.WebViewer.getInstance();
  const fileData = await activeDocument.getFileData();
  const extension = activeDocument.filename.split('.').pop();
  const pdfBuffer = await Core.officeToPDFBuffer(fileData, { extension });
  const convertedDoc = await PDFNet.PDFDoc.createFromBuffer(pdfBuffer);
  return convertedDoc;
}
153
// Navigates the viewer to a page, records it in `currentPage` and starts
// text extraction for it.
//
// `pageNumber` may be a number or a numeric string; it is converted to a
// number once so the viewer, `currentPage` and the extraction helpers all
// receive the same numeric value. Anything that is not a whole number in the
// range 1..pageCount is ignored (pages are numbered from 1, so 0 is invalid).
function setPage(pageNumber) {
  const page = Number(pageNumber);
  if (!Number.isInteger(page) || page < 1 || page > pageCount) return;

  window.WebViewer.getInstance().Core.documentViewer.setCurrentPage(page);
  currentPage = page;

  // Extraction runs in the background; log failures rather than leaving an
  // unhandled promise rejection.
  getAllTextFromDocument(page).catch((err) => {
    console.error('Text extraction failed.', err);
  });
}
163
// annotationChanged handler: re-extracts page text and annotation text for
// whichever page the viewer currently reports as current.
function getAnnotListener() {
  const { documentViewer } = window.WebViewer.getInstance().Core;
  const visiblePage = documentViewer.getCurrentPage();
  getAllTextFromDocument(visiblePage);
}
168
// pageNumberUpdated handler: passes the viewer's current page to setPage so
// that the tracked page and the extracted text follow the viewer.
function viewerUpdated() {
  const { documentViewer } = window.WebViewer.getInstance().Core;
  const visiblePage = documentViewer.getCurrentPage();
  setPage(visiblePage);
}
173
// UI Elements
// Injects ui-elements.js into <head> and, once it has loaded, calls
// UIElements.init('viewer'). Does nothing when window.SidePanel is already defined.
// NOTE(review): the guard checks window.SidePanel while the onload handler
// uses UIElements — presumably both are defined by ui-elements.js; confirm.
function createUIElements() {
  if (window.SidePanel) {
    return;
  }

  const loader = Object.assign(document.createElement('script'), {
    src: '/showcase-demos/pdf-text-extractor/ui-elements.js',
    onload: () => {
      UIElements.init('viewer');
    },
  });
  document.head.appendChild(loader);
}
189
// Copies the module-level state (current page, page count, page text and
// annotation text) onto `window` under the same property names.
function updateGlobalVars() {
  Object.assign(window, {
    currentPage,
    pageCount,
    textContent,
    annotTextContent,
  });
}
197
// Start the demo as soon as this module is evaluated
initializeWebViewer();
200

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales