Extract Text, Read, Parse PDF - TextExtract - Node.js (JavaScript) Sample Code

Sample code for using Apryse SDK to read a PDF (parse and extract text), provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. If you'd like to search text on PDF pages, see our code sample for text search.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with the Server SDK.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6
7const { PDFNet } = require('@pdftron/pdfnet-node');
8const PDFTronLicense = require('../LicenseKey/LicenseKey');
9
10((exports) => {
11
12 exports.runTextExtractTest = async () => {
// A utility method used to dump all text content in the console window.
//
// Pulls elements from `reader` until `reader.next()` yields null and logs a
// line for each text-related element. When a form element is met, the reader
// is switched into the form, the form's content is dumped recursively, and the
// reader is switched back before continuing with the outer content.
//
// @param {object} reader  An element reader already positioned on some content.
// @returns {Promise<void>} Resolves once every element has been visited.
const dumpAllText = async (reader) => {
  let element;
  while ((element = await reader.next()) !== null) {
    switch (await element.getType()) {
      case PDFNet.Element.Type.e_text_begin:
        console.log('\n--> Text Block Begin');
        break;
      case PDFNet.Element.Type.e_text_end:
        console.log('\n--> Text Block End');
        break;
      case PDFNet.Element.Type.e_text: {
        const bbox = await element.getBBox();
        console.log('\n--> BBox: ' + bbox.x1.toFixed(2) + ', ' + bbox.y1.toFixed(2) + ', ' + bbox.x2.toFixed(2) + ', ' + bbox.y2.toFixed(2) + '\n');
        const textString = await element.getTextString();
        console.log(textString);
        break;
      }
      case PDFNet.Element.Type.e_text_new_line:
        console.log('\n--> New Line');
        break;
      case PDFNet.Element.Type.e_form:
        // Each reader call is awaited so the nested read cannot start before
        // the form has been entered, nor the outer read resume before it has
        // been left; it also keeps rejections from becoming unhandled.
        await reader.formBegin();
        await dumpAllText(reader);
        await reader.end();
        break;
      default:
        // Every other element type is ignored by this dump.
        break;
    }
  }
};
43
// Helper method for readTextFromRect.
//
// Pulls elements from `reader` until `reader.next()` yields null. For every
// text element whose bounding box reports an intersection with `pos`, the
// element's text plus a trailing '\n' is appended to the accumulated string.
// Form elements are entered and searched recursively.
//
// @param {object} reader   An element reader already positioned on some content.
// @param {object} pos      The selection rectangle passed to intersectRect().
// @param {string} srchStr  Text accumulated so far (defaults to '').
// @returns {Promise<string>} `srchStr` followed by all newly matched text.
const rectTextSearch = async (reader, pos, srchStr = '') => {
  // Work on a local accumulator instead of reassigning the parameter.
  let collected = srchStr;
  let element;
  while ((element = await reader.next()) !== null) {
    switch (await element.getType()) {
      case PDFNet.Element.Type.e_text: {
        const bbox = await element.getBBox();
        if (await bbox.intersectRect(bbox, pos)) {
          collected += (await element.getTextString()) + '\n';
        }
        break;
      }
      case PDFNet.Element.Type.e_text_new_line:
        break;
      case PDFNet.Element.Type.e_form:
        await reader.formBegin();
        // The recursive call already returns the accumulated text plus its
        // own matches, so assign the result; appending it would duplicate
        // everything collected before the form.
        collected = await rectTextSearch(reader, pos, collected);
        await reader.end();
        break;
      default:
        // Other element types carry nothing this search collects.
        break;
    }
  }
  return collected;
};
69
// Collects the text on `page` whose bounding boxes intersect `pos`.
//
// Starts `reader` on the page (default parameters), delegates the element walk
// to rectTextSearch(), and always ends the reader afterwards, including when
// the search throws.
//
// @param {object} page    The page to read from.
// @param {object} pos     The selection rectangle.
// @param {object} reader  The element reader to use; it is begun and ended here.
// @returns {Promise<string>} The matched text, one '\n'-terminated entry per element.
const readTextFromRect = async (page, pos, reader) => {
  // Awaited so the search cannot start before the reader is on the page.
  await reader.beginOnPage(page); // uses default parameters.
  try {
    return await rectTextSearch(reader, pos, '');
  } finally {
    await reader.end();
  }
};
77
// Renders `num` in upper-case hexadecimal and keeps the last two characters,
// left-padding a single digit with '0' (e.g. 10 -> '0A', 255 -> 'FF').
const twoDigitHex = (num) => {
  const padded = '0' + num.toString(16).toUpperCase();
  return padded.slice(-2);
};
82
// Builds the ` style="..."` attribute text for a text style object.
//
// Reads the style's colour (components 0, 1 and 2, each rendered through
// twoDigitHex), font name, font size and serif flag, and returns them as a
// single attribute string with a leading space.
//
// @param {object} s  A style object exposing getColor/getFontName/getFontSize/isSerif.
// @returns {Promise<string>} The formatted attribute string.
const printStyle = async (s) => {
  const color = await s.getColor();

  // Fetch the three colour components in order before formatting them.
  const components = [];
  for (const index of [0, 1, 2]) {
    components.push(await color.get(index));
  }
  const hexColor = components.map((value) => twoDigitHex(value)).join('');

  const fontName = await s.getFontName();
  const fontSize = await s.getFontSize();

  // NOTE(review): the marker emitted when isSerif() is true reads "sans-serif";
  // this looks inverted, but the output is kept as-is pending confirmation.
  let serifMarker = ' ';
  if (await s.isSerif()) {
    serifMarker = ' sans-serif; ';
  }

  return ' style="font-family:' + fontName
    + '; font-size:' + fontSize + ';'
    + serifMarker
    + 'color:#' + hexColor + ';"';
};
95
// Runs the text-extraction examples against one input PDF.
//
// Examples 1-4 share a single document and a text extractor started on page 1;
// example 5 reopens the document and walks its elements with an element
// reader. Which examples run is selected by the boolean flags below.
//
// Each deallocate stack that is started is also ended, even when an example
// throws or the function returns early. Errors are logged, not rethrown.
//
// @returns {Promise<number>} 0 when everything that ran succeeded, 1 when the
//   page was not found or any example threw.
const main = async () => {
  // Status returned to the caller; set to 1 by either catch block below.
  let ret = 0;

  // Relative path to the folder containing test files.
  const inputPath = '../TestFiles/';
  const inputFilename = 'newsletter.pdf'; // addimage.pdf, newsletter.pdf

  const example1Basic = false;
  const example2XML = false;
  const example3Wordlist = false;
  const example4Advanced = true;
  const example5LowLevel = false;

  // Formats a rectangle's four coordinates with two decimals, comma separated.
  const formatBox = (r) => r.x1.toFixed(2) + ', ' + r.y1.toFixed(2) + ', ' + r.x2.toFixed(2) + ', ' + r.y2.toFixed(2);

  try {
    await PDFNet.startDeallocateStack();
    try {
      const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + inputFilename);
      await doc.initSecurityHandler();

      const page = await doc.getPage(1);

      if (page.id === '0') {
        console.log('Page not found.');
        // The finally block below still ends the deallocate stack.
        return 1;
      }

      const txt = await PDFNet.TextExtractor.create();
      await txt.begin(page);

      // Example 1. Get all text on the page in a single string.
      // Words will be separated with space or new line characters.
      if (example1Basic) {
        const wordCount = await txt.getWordCount();
        console.log('Word Count: ' + wordCount);
        const text = await txt.getAsText();
        console.log('\n\n- GetAsText --------------------------');
        console.log(text);
        console.log('-----------------------------------------------------------');
      }

      // Example 2. Get XML logical structure for the page.
      if (example2XML) {
        const xmlFlags = PDFNet.TextExtractor.XMLOutputFlags.e_words_as_elements
          | PDFNet.TextExtractor.XMLOutputFlags.e_output_bbox
          | PDFNet.TextExtractor.XMLOutputFlags.e_output_style_info;
        const text = await txt.getAsXML(xmlFlags);
        console.log('\n\n- GetAsXML --------------------------\n' + text);
        console.log('-----------------------------------------------------------');
      }

      // Example 3. Extract words one by one.
      if (example3Wordlist) {
        for (let line = await txt.getFirstLine(); await line.isValid(); line = await line.getNextLine()) {
          for (let word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
            console.log(await word.getString());
          }
        }
        console.log('-----------------------------------------------------------');
      }

      // Example 4. A more advanced text extraction example.
      // The output is XML structure containing paragraphs, lines, words,
      // as well as style and positioning information.
      if (example4Advanced) {
        // -1 means "no <Flow>/<Para> element is currently open".
        let curFlowID = -1;
        let curParaID = -1;

        console.log('<PDFText>');

        // For each line on the page...
        for (let line = await txt.getFirstLine(); await line.isValid(); line = await line.getNextLine()) {
          if ((await line.getNumWords()) === 0) {
            continue;
          }

          // A change of flow ID closes any open paragraph and flow, then opens a new flow.
          const flowID = await line.getFlowID();
          if (curFlowID !== flowID) {
            if (curFlowID !== -1) {
              if (curParaID !== -1) {
                curParaID = -1;
                console.log('</Para>');
              }
              console.log('</Flow>');
            }
            curFlowID = flowID;
            console.log('<Flow id="' + curFlowID + '">');
          }

          // A change of paragraph ID closes any open paragraph and opens a new one.
          const paraID = await line.getParagraphID();
          if (curParaID !== paraID) {
            if (curParaID !== -1) {
              console.log('</Para>');
            }
            curParaID = paraID;
            console.log('<Para id="' + curParaID + '">');
          }

          const lineBox = await line.getBBox();
          const lineStyle = await line.getStyle();
          let lineTag = '<Line box="' + formatBox(lineBox) + '"';
          lineTag += await printStyle(lineStyle);
          const currentLineNum = await line.getCurrentNum();
          lineTag += ' cur_num="' + currentLineNum + '">';
          console.log(lineTag);

          // For each word in the line...
          for (let word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
            // output bounding box for the word
            const wordBox = await word.getBBox();
            const currentNum = await word.getCurrentNum();
            if ((await word.getStringLen()) === 0) {
              continue;
            }
            let wordTag = '<Word box="' + formatBox(wordBox) + '" cur_num="' + currentNum + '"';
            // if the word style is different from the parent style, output the new style
            const wordStyle = await word.getStyle();
            if (!(await wordStyle.compare(lineStyle))) {
              wordTag += await printStyle(wordStyle);
            }
            wordTag += '>' + (await word.getString()) + '</Word>';
            console.log(wordTag);
          }
          console.log('</Line>');
        }

        // Close whatever is still open after the last line.
        if (curFlowID !== -1) {
          if (curParaID !== -1) {
            curParaID = -1;
            console.log('</Para>');
          }
          console.log('</Flow>');
        }
        console.log('</PDFText>');
      }
    } finally {
      // Runs on success, on the early return above, and on any exception.
      await PDFNet.endDeallocateStack();
    }
  } catch (err) {
    console.log(err);
    console.log(err.stack);
    ret = 1;
  }

  if (example5LowLevel) {
    // `ret` is deliberately not reset here, so a failure above stays reported.
    try {
      await PDFNet.startDeallocateStack();
      try {
        const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + inputFilename);
        await doc.initSecurityHandler();

        // Example 1. Extract all text content from the document
        const reader = await PDFNet.ElementReader.create();

        // Read every page
        for (const itr = await doc.getPageIterator(1); await itr.hasNext(); await itr.next()) {
          const page = await itr.current();
          await reader.beginOnPage(page);
          await dumpAllText(reader);
          await reader.end();
        }

        // Example 2. Extract text content based on the
        // selection rectangle.
        console.log('\n----------------------------------------------------');
        console.log('Extract text based on the selection rectangle.');
        console.log('----------------------------------------------------');

        const firstPage = await (await doc.getPageIterator()).current();
        let s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(27, 392, 563, 534)), reader);
        console.log('\nField 1: ' + s1);

        s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(28, 551, 106, 623)), reader);
        console.log('Field 2: ' + s1);

        s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(208, 550, 387, 621)), reader);
        console.log('Field 3: ' + s1);

        // ...
        console.log('Done');
      } finally {
        await PDFNet.endDeallocateStack();
      }
    } catch (err) {
      console.log(err.stack);
      ret = 1;
    }
  }

  return ret;
};
// Run main through PDFNet.runWithCleanup with the license key, log any
// failure, and shut PDFNet down afterwards whether or not main succeeded.
// The work is awaited so the enclosing async function settles only once it
// has finished.
try {
  await PDFNet.runWithCleanup(main, PDFTronLicense.Key);
} catch (error) {
  // JSON.stringify() renders Error instances as "{}", which hides the cause,
  // so report the stack (or message) for Errors and keep JSON for anything else.
  const detail = error instanceof Error ? (error.stack || error.message) : JSON.stringify(error);
  console.log('Error: ' + detail);
} finally {
  await PDFNet.shutdown();
}
283 };
284 exports.runTextExtractTest();
285})(exports);
286// eslint-disable-next-line spaced-comment
287//# sourceURL=TextExtractTest.js

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales