Text Extraction

Sample JavaScript code for using Apryse SDK to read a PDF (parse and extract text). If you'd like to search text on PDF pages, see our code sample for text search. Learn more about our full PDF Data Extraction SDK Capabilities.
To start your free trial, get started with WebViewer.
This sample works with Full-API for WebViewer.
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6(exports => {
7  const PDFNet = exports.Core.PDFNet;
8
9  exports.runTextExtractTest = async () => {
10    // A utility method used to dump all text content in the console window.
11    const dumpAllText = async reader => {
12      let element;
13      let bbox;
14      let arr;
15      while ((element = await reader.next()) !== null) {
16        switch (await element.getType()) {
17          case PDFNet.Element.Type.e_text_begin:
18            console.log('--> Text Block Begin');
19            break;
20          case PDFNet.Element.Type.e_text_end:
21            console.log('--> Text Block End');
22            break;
23          case PDFNet.Element.Type.e_text:
24            bbox = await element.getBBox();
25            console.log('--> BBox: ' + bbox.x1 + ', ' + bbox.y1 + ', ' + bbox.x2 + ', ' + bbox.y2 + '\n');
26            arr = await element.getTextString();
27            console.log(arr);
28            break;
29          case PDFNet.Element.Type.e_text_new_line:
30            break;
31          case PDFNet.Element.Type.e_form:
32            reader.formBegin();
33            await dumpAllText(reader);
34            reader.end();
35            break;
36        }
37      }
38    };
39
40    // helper method for ReadTextFromRect
41    const rectTextSearch = async (reader, pos, srchStr) => {
42      let element;
43      let arr;
44      while ((element = await reader.next()) !== null) {
45        let bbox;
46        switch (await element.getType()) {
47          case PDFNet.Element.Type.e_text:
48            bbox = await element.getBBox();
49            if (await bbox.intersectRect(bbox, pos)) {
50              arr = await element.getTextString();
51              srchStr += arr + '\n';
52            }
53            break;
54          case PDFNet.Element.Type.e_text_new_line:
55            break;
56          case PDFNet.Element.Type.e_form:
57            reader.formBegin();
58            srchStr += await rectTextSearch(reader, pos, srchStr); // possibly need srchStr = ...
59            reader.end();
60            break;
61        }
62      }
63      return srchStr;
64    };
65
66    const readTextFromRect = async (page, pos, reader) => {
67      let srchStr = '';
68      reader.beginOnPage(page); // uses default parameters.
69      srchStr += await rectTextSearch(reader, pos, srchStr);
70      reader.end();
71      return srchStr;
72    };
73
74    const printStyle = async s => {
75      const rgb = await s.getColor();
76      const rColorVal = await rgb.get(0);
77      const gColorVal = await rgb.get(1);
78      const bColorVal = await rgb.get(2);
79      const fontName = await s.getFontName();
80      const fontSize = await s.getFontSize();
81      const serifOutput = (await s.isSerif()) ? ' sans-serif; ' : ' ';
82      const returnString =
83        'style="font-family:' + fontName + ';font-size:' + fontSize + ';' + serifOutput + 'color: #' + rColorVal.toString(16) + ', ' + gColorVal.toString(16) + ', ' + bColorVal.toString(16) + ')"';
84      return returnString;
85    };
86
87    const main = async () => {
88      console.log('Beginning Test');
89
90      // Relative path to the folder containing test files.
91      const inputURL = '../TestFiles/';
92      const inputFilename = 'newsletter.pdf'; // addimage.pdf, newsletter.pdf
93
94      const example1Basic = false;
95      const example2XML = false;
96      const example3Wordlist = false;
97      const example4Advanced = true;
98      const example5LowLevel = false;
99      let doc = null;
100
101      try {
102        await PDFNet.startDeallocateStack();
103        doc = await PDFNet.PDFDoc.createFromURL(inputURL + inputFilename);
104        doc.initSecurityHandler();
105        doc.lock();
106
107        const page = await doc.getPage(1);
108
109        if (page.id === '0') {
110          console.log('Page not found.');
111          return 1;
112        }
113
114        const txt = await PDFNet.TextExtractor.create();
115        const rect = new PDFNet.Rect(0, 0, 612, 794);
116        txt.begin(page, rect);
117
118        let text;
119        let line;
120        let word;
121
122        if (example1Basic) {
123          const wordCount = await txt.getWordCount();
124          console.log('Word Count: ' + wordCount);
125          text = await txt.getAsText();
126          console.log('- GetAsText  -------------------------------');
127          console.log(text);
128          console.log('-----------------------------------------');
129        }
130
131        if (example2XML) {
132          text = await txt.getAsXML(
133            PDFNet.TextExtractor.XMLOutputFlags.e_words_as_elements | PDFNet.TextExtractor.XMLOutputFlags.e_output_bbox | PDFNet.TextExtractor.XMLOutputFlags.e_output_style_info
134          );
135          console.log('- GetAsXML  --------------------------' + text);
136          console.log('-----------------------------------------------------------');
137        }
138
139        if (example3Wordlist) {
140          line = await txt.getFirstLine();
141          for (; await line.isValid(); line = await line.getNextLine()) {
142            for (word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
143              text = await word.getString();
144              console.log(text);
145            }
146          }
147          console.log('-----------------------------------------------------------');
148        }
149
150        if (example4Advanced) {
151          let b;
152          let q;
153          let curFlowID = -1;
154          let curParaID = -1;
155
156          for (line = await txt.getFirstLine(); await line.isValid(); line = await line.getNextLine()) {
157            if ((await line.getNumWords()) === 0) {
158              continue;
159            }
160            if ((await line.getFlowID()) !== curFlowID) {
161              if (curFlowID !== -1) {
162                if (curParaID !== -1) {
163                  curParaID = -1;
164                  console.log('</Para>');
165                }
166                console.log('</Flow>');
167              }
168              curFlowID = await line.getFlowID();
169              console.log('<Flow id="' + curFlowID + '">');
170            }
171            if ((await line.getParagraphID()) !== curParaID) {
172              if (curParaID !== -1) {
173                console.log('</Para>');
174              }
175              curParaID = await line.getParagraphID();
176              console.log('<Para id="' + curParaID + '">');
177            }
178            b = await line.getBBox();
179            const lineStyle = await line.getStyle();
180            let outputStringLineBox = '<Line box="' + b.x1 + ', ' + b.y1 + ', ' + b.x2 + ', ' + b.y1 + '">';
181            outputStringLineBox += await printStyle(lineStyle);
182            const currentLineNum = await line.getCurrentNum();
183            outputStringLineBox += ' cur_num="' + currentLineNum + '">';
184            console.log(outputStringLineBox);
185
186            // For each word in the line...
187            let outputStringWord = '';
188            for (word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
189              // output bounding box for the word
190              q = await word.getBBox();
191              const currentNum = await word.getCurrentNum();
192              outputStringWord += '<Word box="' + q.x1 + ', ' + q.y1 + ', ' + q.x2 + ', ' + q.y2 + '" cur_num="' + currentNum + '"';
193              const sz = await word.getStringLen();
194              if (sz === 0) {
195                continue;
196              }
197              // if the word style is different from the parent style, output the new style
198              const sty = await word.getStyle();
199              if (!(await sty.compare(lineStyle))) {
200                console.log(await printStyle(sty));
201              }
202              outputStringWord += '>' + (await word.getString()) + '</Word>';
203              console.log(outputStringWord);
204            }
205            console.log('</Line>');
206          }
207          if (curFlowID !== -1) {
208            if (curParaID !== -1) {
209              curParaID = -1;
210              console.log('</Para>');
211            }
212            console.log('</Flow>\n');
213          }
214        }
215        console.log('done');
216        await PDFNet.endDeallocateStack();
217      } catch (err) {
218        console.log(err);
219        console.log(err.stack);
220      }
221
222      if (example5LowLevel) {
223        try {
224          await PDFNet.startDeallocateStack();
225          doc = await PDFNet.PDFDoc.createFromURL(inputURL + inputFilename);
226          doc.initSecurityHandler();
227          doc.lock();
228
229          // Example 1. Extract all text content from the document
230          const reader = await PDFNet.ElementReader.create();
231          const itr = await doc.getPageIterator(1);
232
233          //  Read every page
234          for (itr; await itr.hasNext(); itr.next()) {
235            const page = await itr.current();
236            reader.beginOnPage(page);
237            await dumpAllText(reader);
238            reader.end();
239          }
240          // Example 2. Extract text content based on the
241          // selection rectangle.
242          console.log('----------------------------------------------------');
243          console.log('Extract text based on the selection rectangle.');
244          console.log('----------------------------------------------------');
245
246          const firstPage = await (await doc.getPageIterator()).current();
247          let s1 = await readTextFromRect(firstPage, await PDFNet.Rect.init(27, 392, 563, 534), reader);
248          console.log('Field 1: ' + s1);
249
250          s1 = await readTextFromRect(firstPage, await PDFNet.Rect.init(28, 551, 106, 623), reader);
251          console.log('Field 2: ' + s1);
252
253          s1 = await readTextFromRect(firstPage, await PDFNet.Rect.init(208, 550, 387, 621), reader);
254          console.log('Field 3: ' + s1);
255
256          // ...
257          console.log('Done');
258          await PDFNet.endDeallocateStack();
259        } catch (err) {
260          console.log(err.stack);
261        }
262      }
263    };
264    // add your own license key as the second parameter, e.g. PDFNet.runWithCleanup(main, 'YOUR_LICENSE_KEY')
265    PDFNet.runWithCleanup(main);
266  };
267})(window);
268// eslint-disable-next-line spaced-comment
269//# sourceURL=TextExtractTest.js
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales
Product:

Text Extraction