TextExtraction

Sample JavaScript code for using the Apryse SDK to read a PDF (parse and extract text). If you'd like to search for text on PDF pages, see our code sample for text search. Learn more about our Web SDK and PDF Data Extraction SDK capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6(exports => {
7 const PDFNet = exports.Core.PDFNet;
8
9 exports.runTextExtractTest = async () => {
/**
 * Utility that dumps all text content reachable from `reader` to the console.
 *
 * Elements are pulled with `reader.next()` until it yields `null`. Text
 * begin/end markers and text runs are logged; form elements are entered with
 * `formBegin()`, dumped recursively, and left again with `end()`.
 *
 * @param {object} reader - Element reader exposing async `next()`,
 *   `formBegin()` and `end()`; the caller is expected to have started it
 *   already (e.g. via `beginOnPage`).
 * @returns {Promise<void>} Resolves once the current element stream is exhausted.
 */
const dumpAllText = async reader => {
  const ElementType = PDFNet.Element.Type;
  let element;
  while ((element = await reader.next()) !== null) {
    switch (await element.getType()) {
      case ElementType.e_text_begin:
        console.log('--> Text Block Begin');
        break;
      case ElementType.e_text_end:
        console.log('--> Text Block End');
        break;
      case ElementType.e_text: {
        const bbox = await element.getBBox();
        console.log(`--> BBox: ${bbox.x1}, ${bbox.y1}, ${bbox.x2}, ${bbox.y2}\n`);
        console.log(await element.getTextString());
        break;
      }
      case ElementType.e_text_new_line:
        break;
      case ElementType.e_form:
        // Await both reader transitions: the recursion must not start reading
        // before the form stream is open, and a failure here has to reach the
        // caller's try/catch instead of becoming an unhandled rejection.
        await reader.formBegin();
        await dumpAllText(reader);
        await reader.end();
        break;
      default:
        // Every other element type carries no text for this dump.
        break;
    }
  }
};
39
/**
 * Helper for readTextFromRect: appends the text of every text element whose
 * bounding box intersects `pos` to `srchStr`, descending into form elements.
 *
 * @param {object} reader - Element reader exposing async `next()`,
 *   `formBegin()` and `end()`, already started by the caller.
 * @param {object} pos - Selection rectangle handed to `bbox.intersectRect`.
 * @param {string} srchStr - Text accumulated so far; it is the prefix of the
 *   returned value.
 * @returns {Promise<string>} `srchStr` followed by each matching text run,
 *   every run terminated by '\n'.
 */
const rectTextSearch = async (reader, pos, srchStr) => {
  const ElementType = PDFNet.Element.Type;
  let collected = srchStr;
  let element;
  while ((element = await reader.next()) !== null) {
    switch (await element.getType()) {
      case ElementType.e_text: {
        const bbox = await element.getBBox();
        if (await bbox.intersectRect(bbox, pos)) {
          const text = await element.getTextString();
          collected += text + '\n';
        }
        break;
      }
      case ElementType.e_text_new_line:
        break;
      case ElementType.e_form:
        await reader.formBegin();
        // The recursive call already returns the accumulator it was given plus
        // the form's text, so assign the result; appending it with += would
        // duplicate everything collected before the form.
        collected = await rectTextSearch(reader, pos, collected);
        await reader.end();
        break;
      default:
        break;
    }
  }
  return collected;
};
65
/**
 * Collects the text on `page` that rectTextSearch reports for rectangle `pos`.
 *
 * @param {object} page - Page handed to `reader.beginOnPage`.
 * @param {object} pos - Selection rectangle forwarded to rectTextSearch.
 * @param {object} reader - Element reader exposing async `beginOnPage()` and
 *   `end()`; it is started and ended by this function.
 * @returns {Promise<string>} The string returned by rectTextSearch for an
 *   empty starting accumulator.
 */
const readTextFromRect = async (page, pos, reader) => {
  // Await the reader transitions so the search only runs on an opened page
  // and any failure reaches the caller instead of being a floating rejection.
  await reader.beginOnPage(page); // uses default parameters.
  const found = await rectTextSearch(reader, pos, '');
  await reader.end();
  return found;
};
73
/**
 * Builds an HTML-like `style="..."` attribute describing a text style.
 *
 * @param {object} s - Style object exposing async `getColor()` (whose result
 *   exposes async `get(index)`), `getFontName()`, `getFontSize()` and
 *   `isSerif()`.
 * @returns {Promise<string>} For example
 *   `style="font-family:Times, serif;font-size:10;color:#1a2b3c"`.
 */
const printStyle = async s => {
  // NOTE(review): assumes each colour channel is an integer in 0-255; confirm
  // against the value range of the colour object returned by getColor().
  const toHexByte = value => Math.min(255, Math.max(0, Math.round(value))).toString(16).padStart(2, '0');

  const rgb = await s.getColor();
  const red = await rgb.get(0);
  const green = await rgb.get(1);
  const blue = await rgb.get(2);
  const fontName = await s.getFontName();
  const fontSize = await s.getFontSize();
  // Only serif-ness is queried, so a generic fallback family is added for
  // serif fonts alone; other fonts get no generic family.
  const genericFamily = (await s.isSerif()) ? ', serif' : '';

  return `style="font-family:${fontName}${genericFamily};font-size:${fontSize};color:#${toHexByte(red)}${toHexByte(green)}${toHexByte(blue)}"`;
};
86
// Logs the flow / paragraph / line / word hierarchy reported by a started
// TextExtractor as XML-like markup (example 4).
const logTextStructure = async txt => {
  let curFlowID = -1;
  let curParaID = -1;

  for (let line = await txt.getFirstLine(); await line.isValid(); line = await line.getNextLine()) {
    if ((await line.getNumWords()) === 0) {
      continue;
    }

    const flowID = await line.getFlowID();
    if (flowID !== curFlowID) {
      if (curFlowID !== -1) {
        if (curParaID !== -1) {
          curParaID = -1;
          console.log('</Para>');
        }
        console.log('</Flow>');
      }
      curFlowID = flowID;
      console.log(`<Flow id="${curFlowID}">`);
    }

    const paraID = await line.getParagraphID();
    if (paraID !== curParaID) {
      if (curParaID !== -1) {
        console.log('</Para>');
      }
      curParaID = paraID;
      console.log(`<Para id="${curParaID}">`);
    }

    const b = await line.getBBox();
    const lineStyle = await line.getStyle();
    const lineStyleAttr = await printStyle(lineStyle);
    const currentLineNum = await line.getCurrentNum();
    // The tag is closed only after all attributes, and the box ends with y2.
    console.log(`<Line box="${b.x1}, ${b.y1}, ${b.x2}, ${b.y2}" ${lineStyleAttr} cur_num="${currentLineNum}">`);

    // For each word in the line...
    for (let word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
      // Skip empty words before building any markup so no unclosed tag is left behind.
      if ((await word.getStringLen()) === 0) {
        continue;
      }
      // Each word gets its own tag string; nothing is carried over from earlier words.
      const q = await word.getBBox();
      const currentNum = await word.getCurrentNum();
      let wordTag = `<Word box="${q.x1}, ${q.y1}, ${q.x2}, ${q.y2}" cur_num="${currentNum}"`;
      // if the word style is different from the parent style, output the new style
      const sty = await word.getStyle();
      if (!(await sty.compare(lineStyle))) {
        wordTag += ` ${await printStyle(sty)}`;
      }
      wordTag += `>${await word.getString()}</Word>`;
      console.log(wordTag);
    }
    console.log('</Line>');
  }

  if (curFlowID !== -1) {
    if (curParaID !== -1) {
      curParaID = -1;
      console.log('</Para>');
    }
    console.log('</Flow>\n');
  }
};

// Low-level ElementReader walkthrough (example 5): dumps the text of every
// page, then reads three fixed rectangles from the first page.
const runLowLevelExample = async docURL => {
  const doc = await PDFNet.PDFDoc.createFromURL(docURL);
  await doc.initSecurityHandler();
  await doc.lock();

  // Example 1. Extract all text content from the document
  const reader = await PDFNet.ElementReader.create();

  // Read every page
  for (const itr = await doc.getPageIterator(1); await itr.hasNext(); await itr.next()) {
    const page = await itr.current();
    await reader.beginOnPage(page);
    await dumpAllText(reader);
    await reader.end();
  }

  // Example 2. Extract text content based on the
  // selection rectangle.
  console.log('----------------------------------------------------');
  console.log('Extract text based on the selection rectangle.');
  console.log('----------------------------------------------------');

  const firstPage = await (await doc.getPageIterator()).current();
  let s1 = await readTextFromRect(firstPage, await PDFNet.Rect.init(27, 392, 563, 534), reader);
  console.log('Field 1: ' + s1);

  s1 = await readTextFromRect(firstPage, await PDFNet.Rect.init(28, 551, 106, 623), reader);
  console.log('Field 2: ' + s1);

  s1 = await readTextFromRect(firstPage, await PDFNet.Rect.init(208, 550, 387, 621), reader);
  console.log('Field 3: ' + s1);

  // ...
};

/**
 * Sample entry point: loads the test PDF and runs whichever extraction
 * examples are switched on by the `example*` flags below.
 *
 * Errors raised by an example are logged to the console rather than rethrown.
 *
 * @returns {Promise<number|undefined>} 1 when page 1 cannot be found,
 *   otherwise undefined.
 */
const main = async () => {
  console.log('Beginning Test');

  // Relative path to the folder containing test files.
  const inputURL = '../TestFiles/';
  const inputFilename = 'newsletter.pdf'; // addimage.pdf, newsletter.pdf

  // Switches selecting which examples run.
  const example1Basic = false;
  const example2XML = false;
  const example3Wordlist = false;
  const example4Advanced = true;
  const example5LowLevel = false;

  try {
    await PDFNet.startDeallocateStack();
    try {
      const doc = await PDFNet.PDFDoc.createFromURL(inputURL + inputFilename);
      await doc.initSecurityHandler();
      await doc.lock();

      const page = await doc.getPage(1);

      if (page.id === '0') {
        console.log('Page not found.');
        return 1;
      }

      const txt = await PDFNet.TextExtractor.create();
      const rect = new PDFNet.Rect(0, 0, 612, 794);
      await txt.begin(page, rect);

      if (example1Basic) {
        const wordCount = await txt.getWordCount();
        console.log('Word Count: ' + wordCount);
        const text = await txt.getAsText();
        console.log('- GetAsText -------------------------------');
        console.log(text);
        console.log('-----------------------------------------');
      }

      if (example2XML) {
        const XMLFlags = PDFNet.TextExtractor.XMLOutputFlags;
        const text = await txt.getAsXML(XMLFlags.e_words_as_elements | XMLFlags.e_output_bbox | XMLFlags.e_output_style_info);
        console.log('- GetAsXML --------------------------' + text);
        console.log('-----------------------------------------------------------');
      }

      if (example3Wordlist) {
        for (let line = await txt.getFirstLine(); await line.isValid(); line = await line.getNextLine()) {
          for (let word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
            console.log(await word.getString());
          }
        }
        console.log('-----------------------------------------------------------');
      }

      if (example4Advanced) {
        await logTextStructure(txt);
      }
      console.log('done');
    } finally {
      // Runs on success, on errors and on the early return above, so every
      // started deallocate stack is also ended.
      await PDFNet.endDeallocateStack();
    }
  } catch (err) {
    console.log(err);
    console.log(err.stack);
  }

  if (example5LowLevel) {
    try {
      await PDFNet.startDeallocateStack();
      try {
        await runLowLevelExample(inputURL + inputFilename);
        console.log('Done');
      } finally {
        await PDFNet.endDeallocateStack();
      }
    } catch (err) {
      console.log(err.stack);
    }
  }
};
264 // add your own license key as the second parameter, e.g. PDFNet.runWithCleanup(main, 'YOUR_LICENSE_KEY')
265 PDFNet.runWithCleanup(main);
266 };
267})(window);
268// eslint-disable-next-line spaced-comment
269//# sourceURL=TextExtractTest.js

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales