TextExtraction

Sample JavaScript code for using Apryse SDK to read a PDF (parse and extract text). If you'd like to search text on PDF pages, see our code sample for text search. Learn more about our Web SDK and PDF Data Extraction SDK Capabilities.

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6(exports => {
7
8
9
10
11
12
13 const PDFNet = exports.Core.PDFNet;
14
15 exports.runTextExtractTest = async () => {
/**
 * Utility that dumps all text content reachable from `reader` to the console.
 *
 * Pulls elements from the reader until `reader.next()` yields `null`. Text block
 * begin/end markers are logged as banners, text runs are logged with their
 * bounding box followed by their string, and form elements are entered and
 * dumped recursively.
 *
 * @param {object} reader - Element reader already positioned on a page or form;
 *   must provide `next()`, `formBegin()` and `end()`.
 * @returns {Promise<void>} Resolves once the reader is exhausted.
 */
const dumpAllText = async reader => {
  let element;
  while ((element = await reader.next()) !== null) {
    switch (await element.getType()) {
      case PDFNet.Element.Type.e_text_begin:
        console.log('--> Text Block Begin');
        break;
      case PDFNet.Element.Type.e_text_end:
        console.log('--> Text Block End');
        break;
      case PDFNet.Element.Type.e_text: {
        const bbox = await element.getBBox();
        console.log(`--> BBox: ${bbox.x1}, ${bbox.y1}, ${bbox.x2}, ${bbox.y2}\n`);
        console.log(await element.getTextString());
        break;
      }
      case PDFNet.Element.Type.e_text_new_line:
        break;
      case PDFNet.Element.Type.e_form:
        // Await both calls: the recursion must not start before the reader has
        // entered the form, and a failure here must reject this function
        // instead of becoming an unhandled rejection.
        await reader.formBegin();
        await dumpAllText(reader);
        await reader.end();
        break;
      default:
        // Other element types carry no text and are skipped.
        break;
    }
  }
};
45
/**
 * Helper for readTextFromRect: collects the strings of all text elements for
 * which `bbox.intersectRect(bbox, pos)` reports an intersection.
 *
 * Form elements are entered and searched recursively. The recursive call
 * receives the text accumulated so far and returns it extended, so its result
 * replaces the accumulator (appending it would duplicate the earlier text).
 *
 * @param {object} reader - Element reader already positioned on a page or form.
 * @param {object} pos - Rectangle passed to `intersectRect` for each text element.
 * @param {string} srchStr - Text accumulated so far; used as the prefix of the result.
 * @returns {Promise<string>} `srchStr` followed by each matching string, every
 *   match terminated by '\n'.
 */
const rectTextSearch = async (reader, pos, srchStr) => {
  let collected = srchStr;
  let element;
  while ((element = await reader.next()) !== null) {
    switch (await element.getType()) {
      case PDFNet.Element.Type.e_text: {
        const bbox = await element.getBBox();
        // The element's own box is both the receiver and the first operand,
        // exactly as in the original call.
        if (await bbox.intersectRect(bbox, pos)) {
          collected += (await element.getTextString()) + '\n';
        }
        break;
      }
      case PDFNet.Element.Type.e_text_new_line:
        break;
      case PDFNet.Element.Type.e_form:
        await reader.formBegin();
        // Assign, do not append: the returned string already starts with `collected`.
        collected = await rectTextSearch(reader, pos, collected);
        await reader.end();
        break;
      default:
        break;
    }
  }
  return collected;
};
71
/**
 * Reads the text of `page` that rectTextSearch matches against `pos`.
 *
 * The reader is started on the page, searched, and always ended again, even
 * when the search throws, because callers reuse the same reader for several
 * rectangles.
 *
 * @param {object} page - Page to read from.
 * @param {object} pos - Selection rectangle forwarded to rectTextSearch.
 * @param {object} reader - Element reader providing `beginOnPage()` and `end()`.
 * @returns {Promise<string>} The text collected by rectTextSearch.
 */
const readTextFromRect = async (page, pos, reader) => {
  await reader.beginOnPage(page); // uses default parameters.
  try {
    return await rectTextSearch(reader, pos, '');
  } finally {
    await reader.end();
  }
};
79
/**
 * Builds a `style="..."` attribute string describing a text style.
 *
 * The attribute contains the font name, the font size, an optional serif
 * marker and the colour as a six-digit hex value (`#rrggbb`).
 *
 * @param {object} s - Style object providing `getColor()`, `getFontName()`,
 *   `getFontSize()` and `isSerif()`.
 * @returns {Promise<string>} The attribute text, starting with `style="` and
 *   ending with the closing quote (no leading space).
 */
const printStyle = async s => {
  const rgb = await s.getColor();
  // NOTE(review): assumes rgb.get(i) yields 0-255 channel values, as the
  // original's direct toString(16) formatting implies - confirm against the API.
  const toHexByte = value => Math.round(value).toString(16).padStart(2, '0');
  const red = toHexByte(await rgb.get(0));
  const green = toHexByte(await rgb.get(1));
  const blue = toHexByte(await rgb.get(2));
  const fontName = await s.getFontName();
  const fontSize = await s.getFontSize();
  // NOTE(review): a serif font is labelled "sans-serif" here; the label is kept
  // as-is because the intended output cannot be determined from this file.
  const serifOutput = (await s.isSerif()) ? ' sans-serif; ' : ' ';
  return `style="font-family:${fontName};font-size:${fontSize};${serifOutput}color: #${red}${green}${blue}"`;
};
92
/**
 * Logs the flow / paragraph / line / word structure found by a text extractor
 * as XML-like markup, one console line per tag.
 *
 * Lines without words are skipped, and so are words whose string length is 0.
 * A word's style attribute is only written when `compare` reports that it
 * differs from the style of its line.
 *
 * @param {object} txt - Text extractor on which `begin()` has already been called.
 * @returns {Promise<void>}
 */
const logTextStructure = async txt => {
  let curFlowID = -1;
  let curParaID = -1;

  for (let line = await txt.getFirstLine(); await line.isValid(); line = await line.getNextLine()) {
    if ((await line.getNumWords()) === 0) {
      continue;
    }

    const flowID = await line.getFlowID();
    if (flowID !== curFlowID) {
      if (curFlowID !== -1) {
        if (curParaID !== -1) {
          curParaID = -1;
          console.log('</Para>');
        }
        console.log('</Flow>');
      }
      curFlowID = flowID;
      console.log('<Flow id="' + curFlowID + '">');
    }

    const paraID = await line.getParagraphID();
    if (paraID !== curParaID) {
      if (curParaID !== -1) {
        console.log('</Para>');
      }
      curParaID = paraID;
      console.log('<Para id="' + curParaID + '">');
    }

    const b = await line.getBBox();
    const lineStyle = await line.getStyle();
    const lineStyleAttr = await printStyle(lineStyle);
    const currentLineNum = await line.getCurrentNum();
    // The tag stays open until every attribute is written, and the box ends
    // with y2 (not y1 a second time).
    console.log(`<Line box="${b.x1}, ${b.y1}, ${b.x2}, ${b.y2}" ${lineStyleAttr} cur_num="${currentLineNum}">`);

    // For each word in the line...
    for (let word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
      if ((await word.getStringLen()) === 0) {
        continue;
      }
      // Build a fresh tag per word so earlier words are not logged again.
      const q = await word.getBBox();
      const currentNum = await word.getCurrentNum();
      let wordTag = `<Word box="${q.x1}, ${q.y1}, ${q.x2}, ${q.y2}" cur_num="${currentNum}"`;
      // if the word style is different from the parent style, output the new style
      const sty = await word.getStyle();
      if (!(await sty.compare(lineStyle))) {
        wordTag += ' ' + (await printStyle(sty));
      }
      wordTag += '>' + (await word.getString()) + '</Word>';
      console.log(wordTag);
    }
    console.log('</Line>');
  }

  if (curFlowID !== -1) {
    if (curParaID !== -1) {
      curParaID = -1;
      console.log('</Para>');
    }
    console.log('</Flow>\n');
  }
};

/**
 * Entry point of the sample: loads the test PDF and runs the text extraction
 * examples that are switched on by the `example…` flags below.
 *
 * Examples 1-4 work on page 1 through a text extractor; example 5 reloads the
 * document and reads it through an element reader. Errors are logged to the
 * console instead of being rethrown.
 *
 * @returns {Promise<number|undefined>} 1 when page 1 cannot be found,
 *   otherwise undefined.
 */
const main = async () => {
  console.log('Beginning Test');

  // Relative path to the folder containing test files.
  const inputURL = '../TestFiles/';
  const inputFilename = 'newsletter.pdf'; // addimage.pdf, newsletter.pdf

  const example1Basic = false;
  const example2XML = false;
  const example3Wordlist = false;
  const example4Advanced = true;
  const example5LowLevel = false;
  let doc = null;

  try {
    await PDFNet.startDeallocateStack();
    doc = await PDFNet.PDFDoc.createFromURL(inputURL + inputFilename);
    await doc.initSecurityHandler();
    await doc.lock();

    const page = await doc.getPage(1);

    if (page.id === '0') {
      console.log('Page not found.');
      return 1;
    }

    const txt = await PDFNet.TextExtractor.create();
    const rect = new PDFNet.Rect(0, 0, 612, 794);
    await txt.begin(page, rect);

    if (example1Basic) {
      const wordCount = await txt.getWordCount();
      console.log('Word Count: ' + wordCount);
      const text = await txt.getAsText();
      console.log('- GetAsText -------------------------------');
      console.log(text);
      console.log('-----------------------------------------');
    }

    if (example2XML) {
      const text = await txt.getAsXML(
        PDFNet.TextExtractor.XMLOutputFlags.e_words_as_elements | PDFNet.TextExtractor.XMLOutputFlags.e_output_bbox | PDFNet.TextExtractor.XMLOutputFlags.e_output_style_info
      );
      console.log('- GetAsXML --------------------------' + text);
      console.log('-----------------------------------------------------------');
    }

    if (example3Wordlist) {
      for (let line = await txt.getFirstLine(); await line.isValid(); line = await line.getNextLine()) {
        for (let word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
          console.log(await word.getString());
        }
      }
      console.log('-----------------------------------------------------------');
    }

    if (example4Advanced) {
      await logTextStructure(txt);
    }
    console.log('done');
  } catch (err) {
    console.log(err);
    console.log(err.stack);
  } finally {
    // Runs on the early return and on errors too, so the deallocate stack
    // opened above is always closed again.
    await PDFNet.endDeallocateStack();
  }

  if (example5LowLevel) {
    try {
      await PDFNet.startDeallocateStack();
      doc = await PDFNet.PDFDoc.createFromURL(inputURL + inputFilename);
      await doc.initSecurityHandler();
      await doc.lock();

      // Example 1. Extract all text content from the document
      const reader = await PDFNet.ElementReader.create();
      const itr = await doc.getPageIterator(1);

      // Read every page
      for (; await itr.hasNext(); await itr.next()) {
        const page = await itr.current();
        await reader.beginOnPage(page);
        await dumpAllText(reader);
        await reader.end();
      }
      // Example 2. Extract text content based on the
      // selection rectangle.
      console.log('----------------------------------------------------');
      console.log('Extract text based on the selection rectangle.');
      console.log('----------------------------------------------------');

      const firstPage = await (await doc.getPageIterator()).current();
      let s1 = await readTextFromRect(firstPage, await PDFNet.Rect.init(27, 392, 563, 534), reader);
      console.log('Field 1: ' + s1);

      s1 = await readTextFromRect(firstPage, await PDFNet.Rect.init(28, 551, 106, 623), reader);
      console.log('Field 2: ' + s1);

      s1 = await readTextFromRect(firstPage, await PDFNet.Rect.init(208, 550, 387, 621), reader);
      console.log('Field 3: ' + s1);

      // ...
      console.log('Done');
    } catch (err) {
      console.log(err.stack);
    } finally {
      await PDFNet.endDeallocateStack();
    }
  }
};
270 // add your own license key as the second parameter, e.g. PDFNet.runWithCleanup(main, 'YOUR_LICENSE_KEY')
271 PDFNet.runWithCleanup(main);
272 };
273})(window);
274// eslint-disable-next-line spaced-comment
275//# sourceURL=TextExtractTest.js

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales