Sample C# code for using Apryse SDK to read a PDF (parse and extract text). If you'd like to search text on PDF pages, see our code sample for text search. Learn more about our Server SDK and PDF Data Extraction SDK Capabilities.
1//
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3//
4
5using System;
6using System.Drawing;
7using pdftron;
8using pdftron.Common;
9using pdftron.Filters;
10using pdftron.SDF;
11using pdftron.PDF;
12
13
14namespace TextExtractTestCS
15{
16 // This sample illustrates various text extraction capabilities of PDFNet.
17
// Console sample demonstrating the high-level (TextExtractor) and low-level
// (ElementReader) text extraction APIs of PDFNet on "newsletter.pdf".
class Class1
{
    private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
    static Class1() {}

    // Entry point. Runs whichever examples are enabled by the boolean flags below.
    static void Main(string[] args)
    {
        PDFNet.Initialize(PDFTronLicense.Key);

        // try/finally so that PDFNet.Terminate() also runs on the early
        // "Page not found." return and when an unexpected exception escapes.
        try
        {
            // Relative path to the folder containing test files.
            string input_path = "../../../../TestFiles/";

            bool example1_basic = false;
            bool example2_xml = false;
            bool example3_wordlist = false;
            bool example4_advanced = true;
            bool example5_low_level = false;

            // Sample code showing how to use high-level text extraction APIs.
            try
            {
                using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
                {
                    doc.InitSecurityHandler();

                    Page page = doc.GetPage(1);
                    if (page == null)
                    {
                        Console.WriteLine("Page not found.");
                        return;
                    }

                    using (TextExtractor txt = new TextExtractor())
                    {
                        txt.Begin(page); // Read the page.
                        // Other options you may want to consider...
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                        // ...

                        // Example 1. Get all text on the page in a single string.
                        // Words will be separated with space or new line characters.
                        if (example1_basic)
                        {
                            // Get the word count.
                            Console.WriteLine("Word Count: {0}", txt.GetWordCount());

                            Console.WriteLine("\n\n- GetAsText --------------------------\n{0}", txt.GetAsText());
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 2. Get XML logical structure for the page.
                        if (example2_xml)
                        {
                            String text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info);
                            Console.WriteLine("\n\n- GetAsXML --------------------------\n{0}", text);
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 3. Extract words one by one.
                        if (example3_wordlist)
                        {
                            TextExtractor.Word word;
                            for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                            {
                                for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                {
                                    Console.WriteLine(word.GetString());
                                }
                            }
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 4. A more advanced text extraction example.
                        // The output is XML structure containing paragraphs, lines, words,
                        // as well as style and positioning information.
                        if (example4_advanced)
                        {
                            Rect bbox;
                            int cur_flow_id = -1, cur_para_id = -1;

                            TextExtractor.Line line;
                            TextExtractor.Word word;
                            TextExtractor.Style s, line_style;

                            Console.WriteLine("<PDFText>");
                            // For each line on the page...
                            for (line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                            {
                                if (line.GetNumWords() == 0)
                                {
                                    continue;
                                }

                                // A change of flow closes the open paragraph and flow
                                // (if any) before a new <Flow> element is opened.
                                if (cur_flow_id != line.GetFlowID())
                                {
                                    if (cur_flow_id != -1)
                                    {
                                        if (cur_para_id != -1)
                                        {
                                            cur_para_id = -1;
                                            Console.WriteLine("</Para>");
                                        }
                                        Console.WriteLine("</Flow>");
                                    }
                                    cur_flow_id = line.GetFlowID();
                                    Console.WriteLine("<Flow id=\"{0}\">", cur_flow_id);
                                }

                                if (cur_para_id != line.GetParagraphID())
                                {
                                    if (cur_para_id != -1)
                                    {
                                        Console.WriteLine("</Para>");
                                    }
                                    cur_para_id = line.GetParagraphID();
                                    Console.WriteLine("<Para id=\"{0}\">", cur_para_id);
                                }

                                bbox = line.GetBBox();
                                line_style = line.GetStyle();
                                Console.Write("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
                                PrintStyle(line_style);
                                Console.Write(" cur_num=\"" + line.GetCurrentNum() + "\"" + ">\n");

                                // For each word in the line...
                                for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                {
                                    // Skip empty words BEFORE writing anything; otherwise an
                                    // unterminated "<Word ..." fragment is left in the output.
                                    if (word.GetStringLen() == 0)
                                    {
                                        continue;
                                    }

                                    // Output the bounding box for the word.
                                    bbox = word.GetBBox();
                                    Console.Write("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
                                    Console.Write(" cur_num=\"" + word.GetCurrentNum() + "\"");

                                    // If the word style is different from the parent style, output the new style.
                                    s = word.GetStyle();
                                    if (s != line_style)
                                    {
                                        PrintStyle(s);
                                    }

                                    Console.Write(">{0}", word.GetString());
                                    Console.WriteLine("</Word>");
                                }
                                Console.WriteLine("</Line>");
                            }

                            if (cur_flow_id != -1)
                            {
                                if (cur_para_id != -1)
                                {
                                    cur_para_id = -1;
                                    Console.WriteLine("</Para>");
                                }
                                Console.WriteLine("</Flow>");
                            }

                            // The closing root tag belongs to this example only, so it is
                            // written here and not after the extractor is disposed.
                            Console.WriteLine("</PDFText>");
                        }
                    }
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }

            // Sample code showing how to use low-level text extraction APIs.
            if (example5_low_level)
            {
                try
                {
                    LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
                    using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
                    {
                        doc.InitSecurityHandler();

                        // Example 1. Extract all text content from the document
                        using (ElementReader reader = new ElementReader())
                        {
                            PageIterator itr = doc.GetPageIterator();
                            //for (; itr.HasNext(); itr.Next()) // Read every page
                            {
                                // With the loop disabled only the iterator's current
                                // (first) page is dumped.
                                reader.Begin(itr.Current());
                                LowLevelTextExtractUtils.DumpAllText(reader);
                                reader.End();
                            }

                            // Example 2. Extract text based on the selection rectangle.
                            Console.WriteLine("----------------------------------------------------");
                            Console.WriteLine("Extract text based on the selection rectangle.");
                            Console.WriteLine("----------------------------------------------------");

                            Page first_page = doc.GetPage(1);
                            string field1 = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                            string field2 = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                            string field3 = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                            Console.WriteLine("Field 1: {0}", field1);
                            Console.WriteLine("Field 2: {0}", field2);
                            Console.WriteLine("Field 3: {0}", field3);
                            // ...

                            Console.WriteLine("Done.");
                        }
                    }
                }
                catch (PDFNetException e)
                {
                    Console.WriteLine(e.Message);
                }
            }
        }
        finally
        {
            PDFNet.Terminate();
        }
    }

    // Writes a style="..." XML attribute (font family, font size, optional
    // serif marker and hex color) for the given style to the console.
    static void PrintStyle(TextExtractor.Style s)
    {
        Color rgb = s.GetColor();
        String rgb_hex = String.Format("{0:X02}{1:X02}{2:X02};", rgb.R, rgb.G, rgb.B);
        // NOTE(review): the text " sans-serif;" is emitted when IsSerif() is true;
        // kept as-is to preserve the sample's output - confirm whether "serif" was intended.
        Console.Write(" style=\"font-family:{0}; font-size:{1};{2} color:#{3}\"", s.GetFontName(), s.GetFontSize(), (s.IsSerif() ? " sans-serif;" : ""), rgb_hex);
    }
}
230
// Helpers built on ElementReader for low-level text extraction.
class LowLevelTextExtractUtils
{
    // Walks the elements supplied by the reader and prints text block markers
    // and text runs to the console, recursing into form XObjects.
    public static void DumpAllText(ElementReader reader)
    {
        for (Element current = reader.Next(); current != null; current = reader.Next())
        {
            Element.Type kind = current.GetType();

            if (kind == Element.Type.e_text_begin)
            {
                Console.WriteLine("\n--> Text Block Begin");
            }
            else if (kind == Element.Type.e_text_end)
            {
                Console.WriteLine("\n--> Text Block End");
            }
            else if (kind == Element.Type.e_text)
            {
                Rect box = new Rect();
                current.GetBBox(box);
                // Console.WriteLine("\n--> BBox: {0}, {1}, {2}, {3}", box.x1, box.y1, box.x2, box.y2);

                String run = current.GetTextString();
                Console.WriteLine(run);
            }
            else if (kind == Element.Type.e_form)
            {
                // Process form XObjects
                reader.FormBegin();
                DumpAllText(reader);
                reader.End();
            }
            // e_text_new_line and every other element type produce no output.
            // Console.WriteLine("\n--> New Line");
        }
    }


    // Accumulates the text collected by RectTextSearch for the current query.
    private string _srch_str;

    // A helper method for ReadTextFromRect: appends every text run whose
    // bounding box intersects 'pos' to _srch_str, one run per line.
    void RectTextSearch(ElementReader reader, Rect pos)
    {
        for (Element current = reader.Next(); current != null; current = reader.Next())
        {
            Element.Type kind = current.GetType();

            if (kind == Element.Type.e_text)
            {
                Rect box = new Rect();
                current.GetBBox(box);
                if (!box.IntersectRect(box, pos))
                {
                    continue;
                }
                _srch_str += current.GetTextString();
                _srch_str += "\n"; // add a new line?
            }
            else if (kind == Element.Type.e_form)
            {
                // Process form XObjects
                reader.FormBegin();
                RectTextSearch(reader, pos);
                reader.End();
            }
            // e_text_new_line and other element types are ignored.
        }
    }

    // A utility method used to extract all text content from
    // a given selection rectangle. The rectangle coordinates are
    // expressed in PDF user/page coordinate system.
    public string ReadTextFromRect(Page page, Rect pos, ElementReader reader)
    {
        _srch_str = "";

        reader.Begin(page);
        RectTextSearch(reader, pos);
        reader.End();

        return _srch_str;
    }
}
324}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/PDFNet.h>
7#include <PDF/PDFDoc.h>
8#include <PDF/ElementReader.h>
9#include <PDF/Element.h>
10#include <PDF/Font.h>
11#include <Filters/FilterReader.h>
12#include <PDF/Image/Image2RGB.h>
13#include <PDF/TextExtractor.h>
14
15// This sample illustrates the basic text extraction capabilities of PDFNet.
16
17#include <iostream>
18#include "../../LicenseKey/CPP/LicenseKey.h"
19
20using namespace std;
21
22using namespace pdftron;
23using namespace PDF;
24using namespace SDF;
25using namespace Common;
26using namespace Filters;
27
28// A utility method used to dump all text content in the console window.
29void DumpAllText(ElementReader& reader)
30{
31 Element element;
32 while ((element = reader.Next()) != 0)
33 {
34 switch (element.GetType())
35 {
36 case Element::e_text_begin:
37 cout << "\n--> Text Block Begin\n";
38 break;
39 case Element::e_text_end:
40 cout << "\n--> Text Block End\n";
41 break;
42 case Element::e_text:
43 {
44 Rect bbox;
45 element.GetBBox(bbox);
46 cout << "\n--> BBox: " << bbox.x1 << ", "
47 << bbox.y1 << ", "
48 << bbox.x2 << ", "
49 << bbox.y2 << "\n";
50
51 UString arr = element.GetTextString();
52 cout << arr << "\n";
53 }
54 break;
55 case Element::e_text_new_line:
56 cout << "\n--> New Line\n";
57 break;
58 case Element::e_form: // Process form XObjects
59 reader.FormBegin();
60 DumpAllText(reader);
61 reader.End();
62 break;
63 }
64 }
65}
66
67// A helper method for ReadTextFromRect
68void RectTextSearch(ElementReader& reader, const Rect& pos, UString& srch_str)
69{
70 Element element;
71 while (element = reader.Next())
72 {
73 switch (element.GetType())
74 {
75 case Element::e_text:
76 {
77 Rect bbox;
78 element.GetBBox(bbox);
79 if(bbox.IntersectRect(bbox, pos))
80 {
81 UString arr = element.GetTextString();
82 srch_str += arr;
83 srch_str += "\n"; // add a new line?
84 }
85 break;
86 }
87 case Element::e_text_new_line:
88 {
89 break;
90 }
91 case Element::e_form: // Process form XObjects
92 {
93 reader.FormBegin();
94 RectTextSearch(reader, pos, srch_str);
95 reader.End();
96 break;
97 }
98 }
99 }
100}
101
102// A utility method used to extract all text content from
103// a given selection rectangle. The rectangle coordinates are
104// expressed in PDF user/page coordinate system.
105UString ReadTextFromRect(Page& page, const Rect& pos, ElementReader& reader)
106{
107 UString srch_str;
108 reader.Begin(page);
109 RectTextSearch(reader, pos, srch_str);
110 reader.End();
111 return srch_str;
112}
113
114
115void PrintStyle(TextExtractor::Style& s)
116{
117 UInt8 rgb[3];
118 char rgb_hex[24];
119
120 s.GetColor(rgb);
121 sprintf(rgb_hex, "%02X%02X%02X;", rgb[0], rgb[1], rgb[2]);
122 cout << " style=\"font-family:" << s.GetFontName() << "; " << "font-size:" << s.GetFontSize() << ";"
123 << (s.IsSerif() ? " sans-serif; " : " ") << "color:#" << rgb_hex << "\"";
124}
125
126int main(int argc, char *argv[])
127{
128 int ret = 0;
129 PDFNet::Initialize(LicenseKey);
130 // Relative path to the folder containing test files.
131 string input_path = "../../TestFiles/newsletter.pdf";
132
133
134
135
136 const char* filein = argc>1 ? argv[1] : input_path.c_str();
137
138 bool example1_basic = false;
139 bool example2_xml = false;
140 bool example3_wordlist = false;
141 bool example4_advanced = true;
142 bool example5_low_level = false;
143
144 // Sample code showing how to use high-level text extraction APIs.
145 try
146 {
147 PDFDoc doc(filein);
148 doc.InitSecurityHandler();
149
150 Page page = doc.GetPage(1);
151 if (!page){
152 cout << "Page not found." << endl;
153 return 1;
154 }
155
156 TextExtractor txt;
157 txt.Begin(page); // Read the page.
158 // Other options you may want to consider...
159 // txt.Begin(*itr, 0, TextExtractor::e_no_dup_remove);
160 // txt.Begin(*itr, 0, TextExtractor::e_remove_hidden_text);
161
162
163 // Example 1. Get all text on the page in a single string.
164 // Words will be separated with space or new line characters.
165 if (example1_basic)
166 {
167 // Get the word count.
168 cout << "Word Count: " << txt.GetWordCount() << endl;
169
170 UString text;
171 txt.GetAsText(text);
172 cout << "\n\n- GetAsText --------------------------\n" << text << endl;
173 cout << "-----------------------------------------------------------" << endl;
174 }
175
176 // Example 2. Get XML logical structure for the page.
177 if (example2_xml)
178 {
179 UString text;
180 txt.GetAsXML(text, TextExtractor::e_words_as_elements | TextExtractor::e_output_bbox | TextExtractor::e_output_style_info);
181 cout << "\n\n- GetAsXML --------------------------\n" << text << endl;
182 cout << "-----------------------------------------------------------" << endl;
183 }
184
185 // Example 3. Extract words one by one.
186 if (example3_wordlist)
187 {
188 UString text;
189 TextExtractor::Line line = txt.GetFirstLine();
190 TextExtractor::Word word;
191 for (; line.IsValid(); line=line.GetNextLine()) {
192 for (word=line.GetFirstWord(); word.IsValid(); word=word.GetNextWord()) {
193 text.Assign(word.GetString(), word.GetStringLen());
194 cout << text << '\n';
195 }
196 }
197 cout << "-----------------------------------------------------------" << endl;
198 }
199
200 // Example 4. A more advanced text extraction example.
201 // The output is XML structure containing paragraphs, lines, words,
202 // as well as style and positioning information.
203 if (example4_advanced)
204 {
205 const double *b;
206 double q[8];
207 int cur_flow_id=-1, cur_para_id=-1;
208
209 UString uni_str;
210 TextExtractor::Line line;
211 TextExtractor::Word word;
212 TextExtractor::Style s, line_style;
213
214 cout << "<PDFText>\n";
215
216 // For each line on the page...
217 for (line=txt.GetFirstLine(); line.IsValid(); line=line.GetNextLine())
218 {
219 if ( line.GetNumWords() == 0 ) {
220 continue;
221 }
222
223 if (cur_flow_id != line.GetFlowID()) {
224 if (cur_flow_id != -1) {
225 if (cur_para_id != -1) {
226 cur_para_id = -1;
227 cout << "</Para>\n";
228 }
229 cout << "</Flow>\n";
230 }
231 cur_flow_id = line.GetFlowID();
232 cout << "<Flow id=\""<< cur_flow_id << "\">\n";
233 }
234
235 if (cur_para_id != line.GetParagraphID()) {
236 if (cur_para_id != -1)
237 cout << "</Para>\n";
238 cur_para_id = line.GetParagraphID();
239 cout << "<Para id=\""<< cur_para_id << "\">\n";
240 }
241
242 b = line.GetBBox();
243 line_style = line.GetStyle();
244 printf("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", b[0], b[1], b[2], b[3]);
245 PrintStyle(line_style);
246 cout << " cur_num=\"" << line.GetCurrentNum() << "\"";
247 cout << ">\n";
248
249 // For each word in the line...
250 for (word=line.GetFirstWord(); word.IsValid(); word=word.GetNextWord())
251 {
252 // Output the bounding box for the word.
253 word.GetBBox(q);
254 printf("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", q[0], q[1], q[2], q[3]);
255 cout << " cur_num=\"" << word.GetCurrentNum() << "\"";
256 int sz = word.GetStringLen();
257 if (sz == 0) continue;
258
259 // If the word style is different from the parent style, output the new style.
260 s = word.GetStyle();
261 if (s != line_style) {
262 PrintStyle(s);
263 }
264
265 uni_str.Assign(word.GetString(), sz);
266 cout << ">" << uni_str;
267 cout << "</Word>\n";
268 }
269 cout << "</Line>\n";
270 }
271
272 if (cur_flow_id != -1) {
273 if (cur_para_id != -1) {
274 cur_para_id = -1;
275 cout << "</Para>\n";
276 }
277 cout << "</Flow>\n";
278 }
279 cout << "</PDFText>\n";
280 }
281 }
282 catch(Exception& e)
283 {
284 cout << e << endl;
285 ret = 1;
286 }
287 catch(...)
288 {
289 cout << "Unknown Exception" << endl;
290 ret = 1;
291 }
292
293
294 if(example5_low_level)
295 {
296 try
297 {
298 PDFDoc doc(filein);
299 doc.InitSecurityHandler();
300
301 // Example 1. Extract all text content from the document
302
303 ElementReader reader;
304 // Read every page
305 for (PageIterator itr=doc.GetPageIterator(); itr.HasNext(); itr.Next())
306 {
307 reader.Begin(itr.Current());
308 DumpAllText(reader);
309 reader.End();
310 }
311
312 // Example 2. Extract text content based on the
313 // selection rectangle.
314 cout << "\n----------------------------------------------------";
315 cout << "\nExtract text based on the selection rectangle.";
316 cout << "\n----------------------------------------------------\n";
317
318 Page first_page = doc.GetPageIterator().Current();
319 UString s1 = ReadTextFromRect(first_page, Rect(27, 392, 563, 534), reader);
320 cout << "\nField 1: " << s1;
321
322 s1 = ReadTextFromRect(first_page, Rect(28, 551, 106, 623), reader);
323 cout << "\nField 2: " << s1;
324
325 s1 = ReadTextFromRect(first_page, Rect(208, 550, 387, 621), reader);
326 cout << "\nField 3: " << s1;
327
328 // ...
329 cout << "Done." << endl;
330 }
331 catch(Exception& e)
332 {
333 cout << e << endl;
334 ret = 1;
335 }
336 catch(...)
337 {
338 cout << "Unknown Exception" << endl;
339 ret = 1;
340 }
341 }
342 PDFNet::Terminate();
343 return ret;
344}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8 "fmt"
9 "strconv"
10 "os"
11 . "pdftron"
12)
13
14import "pdftron/Samples/LicenseKey/GO"
15
16func PrintStyle (style Style){
17 sansSerifStr := ""
18 if style.IsSerif(){
19 sansSerifStr = " sans-serif;"
20 }
21 rgb := style.GetColor()
22 rgbHex := fmt.Sprintf("%02X%02X%02X;", rgb.Get(0), rgb.Get(1), rgb.Get(2))
23 fontStr := fmt.Sprintf("%g", style.GetFontSize())
24 os.Stdout.Write([]byte(" style=\"font-family:" + style.GetFontName() + "; font-size:" + fontStr + ";" + sansSerifStr + " color:#" + rgbHex + "\""))
25}
26
27func DumpAllText (reader ElementReader){
28 element := reader.Next()
29
30 for element.GetMp_elem().Swigcptr() != 0{
31 etype := element.GetType()
32 if etype == ElementE_text_begin{
33 fmt.Println("Text Block Begin")
34 }else if etype == ElementE_text_end{
35 fmt.Println("Text Block End")
36 }else if etype == ElementE_text{
37 bbox := element.GetBBox()
38 fmt.Println("BBox: " + fmt.Sprintf("%f", bbox.GetX1()) + ", " + fmt.Sprintf("%f", bbox.GetY1()) + ", " +
39 fmt.Sprintf("%f", bbox.GetX2()) + ", " + fmt.Sprintf("%f", bbox.GetY2()))
40 textString := element.GetTextString()
41 fmt.Println(textString)
42 }else if etype == ElementE_text_new_line{
43 fmt.Println("New Line")
44 }else if etype == ElementE_form{
45 reader.FormBegin()
46 DumpAllText(reader)
47 reader.End()
48 }
49 element = reader.Next()
50 }
51}
52
53// A utility method used to extract all text content from
54// a given selection rectangle. The rectangle coordinates are
55// expressed in PDF user/page coordinate system.
56func ReadTextFromRect (page Page, pos Rect, reader ElementReader) string{
57 reader.Begin(page)
58 srchStr := RectTextSearch(reader, pos)
59 reader.End()
60 return srchStr
61}
62//A helper method for ReadTextFromRect
63func RectTextSearch (reader ElementReader, pos Rect) string{
64 element := reader.Next()
65 srchStr2 := ""
66 for element.GetMp_elem().Swigcptr() != 0{
67 etype := element.GetType()
68 if etype == ElementE_text{
69 bbox := element.GetBBox()
70 if (bbox.IntersectRect(bbox, pos)){
71 arr := element.GetTextString()
72 srchStr2 += arr
73 srchStr2 += "\n"
74 }
75 }else if etype == ElementE_text_new_line{
76 //handle text new line here
77 }else if etype == ElementE_form{
78 reader.FormBegin()
79 srchStr2 += RectTextSearch(reader, pos)
80 fmt.Println(srchStr2)
81 reader.End()
82 }
83 element = reader.Next()
84 }
85 return srchStr2
86}
87
88func main(){
89 PDFNetInitialize(PDFTronLicense.Key)
90
91 // Relative path to the folder containing test files.
92 inputPath := "../../TestFiles/newsletter.pdf"
93 example1Basic := false
94 example2Xml := false
95 example3Wordlist := false
96 example4Advanced := true
97 example5LowLevel := false
98
99 // Sample code showing how to use high-level text extraction APIs.
100 doc := NewPDFDoc(inputPath)
101 doc.InitSecurityHandler()
102
103 page := doc.GetPage(1)
104 if page == nil{
105 fmt.Println("page no found")
106 }
107 txt := NewTextExtractor()
108 txt.Begin(page) // Read the page
109
110 // Example 1. Get all text on the page in a single string.
111 // Words will be separated witht space or new line characters.
112 if example1Basic{
113 fmt.Println("Word count: " + strconv.Itoa(txt.GetWordCount()))
114 txtAsText := txt.GetAsText()
115 fmt.Println("- GetAsText --------------------------" + txtAsText)
116 fmt.Println("-----------------------------------------------------------")
117 }
118 // Example 2. Get XML logical structure for the page.
119 if example2Xml{
120 text := txt.GetAsXML(TextExtractorE_words_as_elements |
121 TextExtractorE_output_bbox |
122 TextExtractorE_output_style_info)
123 fmt.Println("- GetAsXML --------------------------" + text)
124 fmt.Println("-----------------------------------------------------------")
125 }
126 // Example 3. Extract words one by one.
127 if example3Wordlist{
128 word := NewWord()
129 line := txt.GetFirstLine()
130 for line.IsValid(){
131 word = line.GetFirstWord()
132 for word.IsValid(){
133 wordString := word.GetString()
134 fmt.Println(wordString)
135 word = word.GetNextWord()
136 }
137 line = line.GetNextLine()
138 }
139 fmt.Println("-----------------------------------------------------------")
140 }
141 // Example 4. A more advanced text extraction example.
142 // The output is XML structure containing paragraphs, lines, words,
143 // as well as style and positioning information.
144 if example4Advanced{
145 bbox := NewRect()
146 curFlowId := -1
147 curParaId := -1
148
149 fmt.Println("<PDFText>")
150 // For each line on the page...
151 line := txt.GetFirstLine()
152 for line.IsValid(){
153 if line.GetNumWords() == 0{
154 line = line.GetNextLine()
155 continue
156 }
157 word := line.GetFirstWord()
158 if curFlowId != line.GetFlowID(){
159 if curFlowId != -1{
160 if curParaId != -1{
161 curParaId = -1
162 fmt.Println("</Para>")
163 }
164 fmt.Println("</Flow>")
165 }
166 curFlowId = line.GetFlowID()
167 fmt.Println("<Flow id=\"" + strconv.Itoa(curFlowId) +"\">")
168 }
169 if curParaId != line.GetParagraphID(){
170 if curParaId != -1{
171 fmt.Println("</Para>")
172 }
173 curParaId= line.GetParagraphID()
174 fmt.Println("<Para id=\"" +strconv.Itoa(curParaId)+ "\">")
175 }
176 bbox = line.GetBBox()
177 lineStyle := line.GetStyle()
178 os.Stdout.Write([]byte(fmt.Sprintf("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2())))
179 PrintStyle (lineStyle)
180 os.Stdout.Write([]byte(" cur_num=\"" + strconv.Itoa(line.GetCurrentNum()) + "\"" + ">\n"))
181
182 // For each word in the line...
183 word = line.GetFirstWord()
184 for word.IsValid(){
185 // Output the bounding box for the word
186 bbox = word.GetBBox()
187 os.Stdout.Write([]byte(fmt.Sprintf("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2())))
188 os.Stdout.Write([]byte(" cur_num=\"" + strconv.Itoa(word.GetCurrentNum()) + "\""));
189 sz := word.GetStringLen()
190 if sz == 0{
191 word = word.GetNextWord()
192 continue
193 }
194 // If the word style is different from the parent style, output the new style.
195 s := word.GetStyle()
196 if !s.IsEqual(lineStyle){
197 PrintStyle (s)
198 }
199 wordString := word.GetString()
200 os.Stdout.Write([]byte(">" + wordString + "</Word>\n"))
201 word = word.GetNextWord()
202 }
203 os.Stdout.Write([]byte("</Line>\n"))
204 line = line.GetNextLine()
205 }
206 if curFlowId != -1{
207 if curParaId != -1{
208 curParaId = -1
209 os.Stdout.Write([]byte("</Para>\n"))
210 }
211 os.Stdout.Write([]byte("</Flow>\n"))
212 }
213 txt.Destroy()
214 doc.Close()
215 fmt.Println("</PDFText>")
216 }
217 // Sample code showing how to use low-level text extraction APIs.
218 if example5LowLevel{
219 doc = NewPDFDoc(inputPath)
220 doc.InitSecurityHandler()
221
222 // Example 1. Extract all text content from the document
223
224 reader := NewElementReader()
225 itr := doc.GetPageIterator()
226 for itr.HasNext(){
227 reader.Begin(itr.Current())
228 DumpAllText(reader)
229 reader.End()
230 itr.Next()
231 }
232
233 // Example 2. Extract text content based on the
234 // selection rectangle.
235
236 fmt.Println("----------------------------------------------------")
237 fmt.Println("Extract text based on the selection rectangle.")
238 fmt.Println("----------------------------------------------------")
239
240 itr = doc.GetPageIterator()
241 firstPage := itr.Current()
242 s1 := ReadTextFromRect(firstPage, NewRect(27.0, 392.0, 563.0, 534.0), reader)
243 fmt.Println("Field 1: " + s1)
244
245 s1 = ReadTextFromRect(firstPage, NewRect(28.0, 551.0, 106.0, 623.0), reader);
246 fmt.Println("Field 2: " + s1)
247
248 s1 = ReadTextFromRect(firstPage, NewRect(208.0, 550.0, 387.0, 621.0), reader);
249 fmt.Println("Field 3: " + s1)
250
251 doc.Close()
252 PDFNetTerminate()
253 fmt.Println("Done.")
254 }
255}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import java.awt.Color;
7import com.pdftron.common.PDFNetException;
8import com.pdftron.pdf.*;
9import java.text.DecimalFormat;
10
11
12// This sample illustrates the basic text extraction capabilities of PDFNet.
13public class TextExtractTest {
14
15 public static void main(String[] args) {
16 PDFNet.initialize(PDFTronLicense.Key());
17
18 // Relative path to the folder containing test files.
19 String input_path = "../../TestFiles/";
20 // string output_path = "../../TestFiles/Output/";
21 boolean example1_basic = false;
22 boolean example2_xml = false;
23 boolean example3_wordlist = false;
24 boolean example4_advanced = true;
25 boolean example5_low_level = false;
26
27 // Sample code showing how to use high-level text extraction APIs.
28 try (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf")) {
29 doc.initSecurityHandler();
30
31 Page page = doc.getPage(1);
32 if (page == null) {
33 System.out.println("Page not found.");
34 }
35
36 TextExtractor txt = new TextExtractor();
37 txt.begin(page); // Read the page.
38 // Other options you may want to consider...
39 // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_no_dup_remove);
40 // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_remove_hidden_text);
41 // ...
42
43 // Example 1. Get all text on the page in a single string.
44 // Words will be separated with space or new line characters.
45 if (example1_basic) {
46 // Get the word count.
47 System.out.println("Word Count: " + txt.getWordCount());
48
49 System.out.println("\n\n- GetAsText --------------------------\n" + txt.getAsText());
50 System.out.println("-----------------------------------------------------------");
51 }
52
53 // Example 2. Get XML logical structure for the page.
54 if (example2_xml) {
55 String text = txt.getAsXML(TextExtractor.e_words_as_elements | TextExtractor.e_output_bbox | TextExtractor.e_output_style_info);
56 System.out.println("\n\n- GetAsXML --------------------------\n" + text);
57 System.out.println("-----------------------------------------------------------");
58 }
59
60 // Example 3. Extract words one by one.
61 if (example3_wordlist) {
62 TextExtractor.Word word;
63 for (TextExtractor.Line line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
64 for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
65 System.out.println(word.getString());
66 }
67 }
68 System.out.println("-----------------------------------------------------------");
69 }
70
71 // Example 4. A more advanced text extraction example.
72 // The output is XML structure containing paragraphs, lines, words,
73 // as well as style and positioning information.
74 if (example4_advanced) {
75 Rect bbox;
76 int cur_flow_id = -1, cur_para_id = -1;
77
78 TextExtractor.Line line;
79 TextExtractor.Word word;
80 TextExtractor.Style s, line_style;
81
82 System.out.println("<PDFText>");
83 // For each line on the page...
84 for (line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
85 if (line.getNumWords() == 0)
86 continue;
87 if (cur_flow_id != line.getFlowID()) {
88 if (cur_flow_id != -1) {
89 if (cur_para_id != -1) {
90 cur_para_id = -1;
91 System.out.println("</Para>");
92 }
93 System.out.println("</Flow>");
94 }
95 cur_flow_id = line.getFlowID();
96 System.out.println("<Flow id=\"" + cur_flow_id + "\">");
97 }
98
99 if (cur_para_id != line.getParagraphID()) {
100 if (cur_para_id != -1)
101 System.out.println("</Para>");
102 cur_para_id = line.getParagraphID();
103 System.out.println("<Para id=\"" + cur_para_id + "\">");
104 }
105
106 bbox = line.getBBox();
107 line_style = line.getStyle();
108 System.out.print("<Line box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
109 printStyle(line_style);
110 System.out.println(" cur_num=\"" + line.getCurrentNum() + "\">");
111
112
113 // For each word in the line...
114 for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
115 // Output the bounding box for the word.
116 bbox = word.getBBox();
117 System.out.print("<Word box=\"" + String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
118 System.out.print(" cur_num=\"" + word.getCurrentNum() + "\"");
119 int sz = word.getStringLen();
120 if (sz == 0) continue;
121
122 // If the word style is different from the parent style, output the new style.
123 s = word.getStyle();
124 if (!s.equals(line_style)) {
125 printStyle(s);
126 }
127
128 System.out.print(">" + word.getString());
129 System.out.println("</Word>");
130 }
131 System.out.println("</Line>");
132 }
133
134 if (cur_flow_id != -1) {
135 if (cur_para_id != -1) {
136 cur_para_id = -1;
137 System.out.println("</Para>");
138 }
139 System.out.println("</Flow>");
140 }
141 }
142 txt.destroy();
143 System.out.println("</PDFText>");
144 } catch (PDFNetException e) {
145 System.out.println(e);
146 }
147
148 // Sample code showing how to use low-level text extraction APIs.
149 if (example5_low_level) {
150 try (PDFDoc doc = new PDFDoc((input_path + "newsletter.pdf"))) {
151 doc.initSecurityHandler();
152
153 // Example 1. Extract all text content from the document
154
155 ElementReader reader = new ElementReader();
156 // Read every page
157 for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
158 reader.begin(itr.next());
159 DumpAllText(reader);
160 reader.end();
161 }
162
163 // Example 2. Extract text content based on the
164 // selection rectangle.
165 System.out.print("\n----------------------------------------------------");
166 System.out.print("\nExtract text based on the selection rectangle.");
167 System.out.println("\n----------------------------------------------------");
168
169 Page first_page = doc.getPageIterator().next();
170 String s1 = ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
171 System.out.print("\nField 1: " + s1);
172
173 s1 = ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
174 System.out.print("\nField 2: " + s1);
175
176 s1 = ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);
177 System.out.print("\nField 3: " + s1);
178
179 // ...
180 System.out.println("Done.");
181 } catch (Exception e) {
182 e.printStackTrace();
183 }
184 }
185
186 PDFNet.terminate();
187 }
188
189
190 static void printStyle(TextExtractor.Style s) {
191 Color rgb = s.getColor();
192 String rgb_hex = String.format("%02X%02X%02X;", rgb.getRed(), rgb.getGreen(), rgb.getBlue() );
193 DecimalFormat df = new DecimalFormat("#.#");
194 System.out.print(" style=\"font-family:" + s.getFontName() + "; "
195 + "font-size:" + df.format(s.getFontSize()) + ";"
196 + (s.isSerif() ? " sans-serif; " : " ")
197 + "color:#" + rgb_hex + "\"");
198 }
199
200 // A utility method used to dump all text content in the console window.
201 static void DumpAllText(ElementReader reader) throws PDFNetException {
202 Element element;
203 while ((element = reader.next()) != null) {
204 switch (element.getType()) {
205 case Element.e_text_begin:
206 System.out.println("\n--> Text Block Begin");
207 break;
208 case Element.e_text_end:
209 System.out.println("\n--> Text Block End");
210 break;
211 case Element.e_text: {
212 Rect bbox = element.getBBox();
213 if (bbox == null) continue;
214 System.out.println("\n--> BBox: " + bbox.getX1() + ", "
215 + bbox.getY1() + ", "
216 + bbox.getX2() + ", "
217 + bbox.getY2());
218
219 String arr = element.getTextString();
220 System.out.println(arr);
221 }
222 break;
223 case Element.e_text_new_line:
224 System.out.println("\n--> New Line");
225 break;
226 case Element.e_form: // Process form XObjects
227 reader.formBegin();
228 DumpAllText(reader);
229 reader.end();
230 break;
231 }
232 }
233 }
234
235 // A helper method for ReadTextFromRect
236 static String RectTextSearch(ElementReader reader, Rect pos) throws PDFNetException {
237 Element element;
238 String srch_str = new String();
239 while ((element = reader.next()) != null) {
240 switch (element.getType()) {
241 case Element.e_text: {
242 Rect bbox = element.getBBox();
243 if (bbox == null) continue;
244 if (bbox.intersectRect(bbox, pos)) {
245 String arr = element.getTextString();
246 srch_str += arr;
247 srch_str += "\n"; // add a new line?
248 }
249 break;
250 }
251 case Element.e_text_new_line: {
252 break;
253 }
254 case Element.e_form: // Process form XObjects
255 {
256 reader.formBegin();
257 srch_str += RectTextSearch(reader, pos);
258 reader.end();
259 break;
260 }
261 }
262 }
263 return srch_str;
264 }
265
266 // A utility method used to extract all text content from
267 // a given selection rectangle. The rectangle coordinates are
268 // expressed in PDF user/page coordinate system.
269 static String ReadTextFromRect(Page page, Rect pos, ElementReader reader) throws PDFNetException {
270 reader.begin(page);
271 String srch_str = RectTextSearch(reader, pos);
272 reader.end();
273 return srch_str;
274 }
275}
1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6
7const { PDFNet } = require('@pdftron/pdfnet-node');
8const PDFTronLicense = require('../LicenseKey/LicenseKey');
9
10((exports) => {
11
12 exports.runTextExtractTest = async () => {
// A utility method used to dump all text content in the console window.
// Reads elements until reader.next() resolves to null, logging a marker for
// text-block boundaries and new lines, the bounding box and string of each
// text run, and recursing into form XObjects.
// formBegin()/end() are awaited so a rejection surfaces in the caller's
// try/catch instead of becoming an unhandled promise rejection.
const dumpAllText = async (reader) => {
  let element;
  while ((element = await reader.next()) !== null) {
    switch (await element.getType()) {
      case PDFNet.Element.Type.e_text_begin:
        console.log('\n--> Text Block Begin');
        break;
      case PDFNet.Element.Type.e_text_end:
        console.log('\n--> Text Block End');
        break;
      case PDFNet.Element.Type.e_text: {
        const bbox = await element.getBBox();
        console.log('\n--> BBox: ' + bbox.x1.toFixed(2) + ', ' + bbox.y1.toFixed(2) + ', ' + bbox.x2.toFixed(2) + ', ' + bbox.y2.toFixed(2) + '\n');
        const text = await element.getTextString();
        console.log(text);
        break;
      }
      case PDFNet.Element.Type.e_text_new_line:
        console.log('\n--> New Line');
        break;
      case PDFNet.Element.Type.e_form:
        // Process form XObjects: descend, dump, then return to the parent stream.
        await reader.formBegin();
        await dumpAllText(reader);
        await reader.end();
        break;
    }
  }
};
43
// Helper method for readTextFromRect.
// Appends to srchStr the string of every text run for which
// bbox.intersectRect(bbox, pos) resolves truthy, each followed by '\n', and
// returns the extended string. Form XObjects are searched recursively.
//
// The recursive call receives the text accumulated so far and returns it
// extended, so its result must REPLACE srchStr. The previous `+=` appended the
// whole accumulated text a second time for every form XObject encountered.
const rectTextSearch = async (reader, pos, srchStr) => {
  let element;
  while ((element = await reader.next()) !== null) {
    switch (await element.getType()) {
      case PDFNet.Element.Type.e_text: {
        const bbox = await element.getBBox();
        if (await bbox.intersectRect(bbox, pos)) {
          const text = await element.getTextString();
          srchStr += text + '\n';
        }
        break;
      }
      case PDFNet.Element.Type.e_text_new_line:
        break;
      case PDFNet.Element.Type.e_form:
        // Process form XObjects.
        await reader.formBegin();
        srchStr = await rectTextSearch(reader, pos, srchStr);
        await reader.end();
        break;
    }
  }
  return srchStr;
};
69
// Extracts the text of every text run on `page` that rectTextSearch matches
// against the rectangle `pos`, using the supplied element reader.
// beginOnPage()/end() are awaited, and end() runs in `finally` so the reader
// is closed even when the search rejects.
const readTextFromRect = async (page, pos, reader) => {
  await reader.beginOnPage(page); // uses default parameters.
  try {
    // Start from an empty string; the helper returns it extended.
    return await rectTextSearch(reader, pos, '');
  } finally {
    await reader.end();
  }
};
77
// Formats a number as exactly two upper-case hexadecimal characters:
// left-padded with '0' when shorter, and truncated to the last two
// characters when longer (e.g. 256 -> '00').
// Uses slice(-2) rather than the deprecated String.prototype.substr.
const twoDigitHex = function (num) {
  const hexStr = num.toString(16).toUpperCase();
  return ('0' + hexStr).slice(-2);
};
82
// Builds (and returns, despite the name) a style="..." attribute string for
// the given text style: font family, font size, optional serif marker and the
// colour as six hex digits taken from components 0..2 of getColor().
// NOTE(review): the marker text is ' sans-serif; ' when isSerif() resolves
// true; kept exactly as in the original output - confirm whether intended.
const printStyle = async (s) => {
  const color = await s.getColor();

  let colorHex = '';
  for (let channel = 0; channel < 3; channel++) {
    colorHex += twoDigitHex(await color.get(channel));
  }

  const family = await s.getFontName();
  const size = await s.getFontSize();
  const serifPart = (await s.isSerif()) ? ' sans-serif; ' : ' ';

  return ` style="font-family:${family}; font-size:${size};${serifPart}color:#${colorHex};"`;
};
95
// Entry point of the sample. Opens the test PDF, runs the high-level
// TextExtractor examples selected by the flags below on page 1, then
// (optionally) the low-level ElementReader examples.
//
// Every PDFNet call that returns a promise is awaited so that failures are
// reported by the surrounding try/catch rather than as unhandled rejections,
// and the early "page not found" exit balances startDeallocateStack().
const main = async () => {
  // eslint-disable-next-line no-unused-vars
  let ret = 0;

  // Relative path to the folder containing test files.
  const inputPath = '../TestFiles/';
  const inputFilename = 'newsletter.pdf'; // addimage.pdf, newsletter.pdf

  const example1Basic = false;
  const example2XML = false;
  const example3Wordlist = false;
  const example4Advanced = true;
  const example5LowLevel = false;

  // Sample code showing how to use high-level text extraction APIs.
  try {
    await PDFNet.startDeallocateStack();
    const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + inputFilename);
    await doc.initSecurityHandler();

    const page = await doc.getPage(1);

    if (page.id === '0') {
      console.log('Page not found.');
      await PDFNet.endDeallocateStack();
      return 1;
    }

    const txt = await PDFNet.TextExtractor.create();
    await txt.begin(page); // Read the page.

    let text;
    let line;
    let word;

    // Example 1. Get all text on the page in a single string.
    // Words will be separated with space or new line characters.
    if (example1Basic) {
      const wordCount = await txt.getWordCount();
      console.log('Word Count: ' + wordCount);
      text = await txt.getAsText();
      console.log('\n\n- GetAsText --------------------------');
      console.log(text);
      console.log('-----------------------------------------------------------');
    }

    // Example 2. Get XML logical structure for the page.
    if (example2XML) {
      text = await txt.getAsXML(PDFNet.TextExtractor.XMLOutputFlags.e_words_as_elements | PDFNet.TextExtractor.XMLOutputFlags.e_output_bbox | PDFNet.TextExtractor.XMLOutputFlags.e_output_style_info);
      console.log('\n\n- GetAsXML --------------------------\n' + text);
      console.log('-----------------------------------------------------------');
    }

    // Example 3. Extract words one by one.
    if (example3Wordlist) {
      for (line = await txt.getFirstLine(); await line.isValid(); line = await line.getNextLine()) {
        for (word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
          text = await word.getString();
          console.log(text);
        }
      }
      console.log('-----------------------------------------------------------');
    }

    // Example 4. A more advanced text extraction example.
    // The output is XML structure containing paragraphs, lines, words,
    // as well as style and positioning information.
    if (example4Advanced) {
      let curFlowID = -1;
      let curParaID = -1;

      console.log('<PDFText>');

      // For each line on the page...
      for (line = await txt.getFirstLine(); await line.isValid(); line = await line.getNextLine()) {
        if ((await line.getNumWords()) === 0) {
          continue;
        }

        // A change of flow closes the open paragraph and flow first.
        const flowID = await line.getFlowID();
        if (curFlowID !== flowID) {
          if (curFlowID !== -1) {
            if (curParaID !== -1) {
              curParaID = -1;
              console.log('</Para>');
            }
            console.log('</Flow>');
          }
          curFlowID = flowID;
          console.log('<Flow id="' + curFlowID + '">');
        }

        const paraID = await line.getParagraphID();
        if (curParaID !== paraID) {
          if (curParaID !== -1) {
            console.log('</Para>');
          }
          curParaID = paraID;
          console.log('<Para id="' + curParaID + '">');
        }

        const b = await line.getBBox();
        const lineStyle = await line.getStyle();
        let outputStringLineBox = '<Line box="' + b.x1.toFixed(2) + ', ' + b.y1.toFixed(2) + ', ' + b.x2.toFixed(2) + ', ' + b.y2.toFixed(2) + '"';
        outputStringLineBox += (await printStyle(lineStyle));
        const currentLineNum = await line.getCurrentNum();
        outputStringLineBox += ' cur_num="' + currentLineNum + '">';
        console.log(outputStringLineBox);

        // For each word in the line...
        for (word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
          // Empty words produce no output at all.
          const sz = await word.getStringLen();
          if (sz === 0) {
            continue;
          }

          // output bounding box for the word
          const q = await word.getBBox();
          const currentNum = await word.getCurrentNum();
          let outputStringWord = '<Word box="' + q.x1.toFixed(2) + ', ' + q.y1.toFixed(2) + ', ' + q.x2.toFixed(2) + ', ' + q.y2.toFixed(2) + '" cur_num="' + currentNum + '"';

          // if the word style is different from the parent style, output the new style
          const sty = await word.getStyle();
          if (!(await sty.compare(lineStyle))) {
            outputStringWord += await printStyle(sty);
          }
          outputStringWord += '>' + (await word.getString()) + '</Word>';
          console.log(outputStringWord);
        }
        console.log('</Line>');
      }

      if (curFlowID !== -1) {
        if (curParaID !== -1) {
          curParaID = -1;
          console.log('</Para>');
        }
        console.log('</Flow>');
      }
      console.log('</PDFText>');
    }
    await PDFNet.endDeallocateStack();
  } catch (err) {
    console.log(err);
    console.log(err.stack);
    ret = 1;
  }

  // Sample code showing how to use low-level text extraction APIs.
  if (example5LowLevel) {
    ret = 0;
    try {
      await PDFNet.startDeallocateStack();
      const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + inputFilename);
      await doc.initSecurityHandler();

      // Example 1. Extract all text content from the document
      const reader = await PDFNet.ElementReader.create();
      const itr = await doc.getPageIterator(1);

      // Read every page
      for (; await itr.hasNext(); await itr.next()) {
        const page = await itr.current();
        await reader.beginOnPage(page);
        await dumpAllText(reader);
        await reader.end();
      }

      // Example 2. Extract text content based on the
      // selection rectangle.
      console.log('\n----------------------------------------------------');
      console.log('Extract text based on the selection rectangle.');
      console.log('----------------------------------------------------');

      const firstPage = await (await doc.getPageIterator()).current();
      let s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(27, 392, 563, 534)), reader);
      console.log('\nField 1: ' + s1);

      s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(28, 551, 106, 623)), reader);
      console.log('Field 2: ' + s1);

      s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(208, 550, 387, 621)), reader);
      console.log('Field 3: ' + s1);

      // ...
      console.log('Done');
      await PDFNet.endDeallocateStack();
    } catch (err) {
      console.log(err.stack);
      ret = 1;
    }
  }
};
282 PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function (error) { console.log('Error: ' + JSON.stringify(error)); }).then(function () { return PDFNet.shutdown(); });
283 };
284 exports.runTextExtractTest();
285})(exports);
286// eslint-disable-next-line spaced-comment
287//# sourceURL=TextExtractTest.js
1<?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
10// Relative path to the folder containing the test files.
11$input_path = getcwd()."/../../TestFiles/newsletter.pdf";
12
13//---------------------------------------------------------------------------------------
14// This sample illustrates the basic text extraction capabilities of PDFNet.
15//---------------------------------------------------------------------------------------
16
// A utility method used to dump all text content in the browser.
// Reads elements until $reader->Next() returns null, echoing a marker for
// text-block boundaries and new lines, the bounding box and string of each
// text run, and recursing into form XObjects.
function DumpAllText($reader)
{
    while (($element = $reader->Next()) != NULL)
    {
        switch ($element->GetType())
        {
            case Element::e_text_begin:
                echo nl2br("\n--> Text Block Begin\n");
                break;
            case Element::e_text_end:
                echo nl2br("\n--> Text Block End\n");
                break;
            case Element::e_text:
            {
                $bbox = $element->GetBBox();
                echo nl2br("\n--> BBox: ".$bbox->x1.", "
                    .$bbox->y1.", "
                    .$bbox->x2.", "
                    .$bbox->y2."\n");

                $arr = $element->GetTextString();
                echo nl2br($arr."\n");
            }
            break;
            case Element::e_text_new_line:
                echo nl2br("\n--> New Line\n");
                break;
            case Element::e_form: // Process form XObjects
                $reader->FormBegin();
                // Pass the $reader variable; the bare word `reader` is an
                // undefined constant and fails as soon as a form is reached.
                DumpAllText($reader);
                $reader->End();
                break;
        }
    }
}
53
// A helper method for ReadTextFromRect.
// Collects the string of every text run for which
// $bbox->IntersectRect($bbox, $pos) is true, each followed by nl2br("\n"),
// and searches form XObjects recursively. Returns the collected text.
function RectTextSearch($reader, $pos)
{
    $found = "";

    for ($el = $reader->Next(); $el != null; $el = $reader->Next())
    {
        $kind = $el->GetType();

        if ($kind == Element::e_text)
        {
            $box = $el->GetBBox();
            if ($box->IntersectRect($box, $pos))
            {
                $found .= $el->GetTextString();
                $found .= nl2br("\n");
            }
        }
        elseif ($kind == Element::e_form) // Process form XObjects
        {
            $reader->FormBegin();
            $found .= RectTextSearch($reader, $pos);
            $reader->End();
        }
        // Every other element type, including new-line markers, adds nothing.
    }

    return $found;
}
88
// A utility method used to extract all text content from
// a given selection rectangle. The rectangle coordinates are
// expressed in PDF user/page coordinate system.
//
// Begins reading $page with the supplied reader, delegates to RectTextSearch
// and returns its result. End() runs in a finally block so the reader is
// closed even when the search throws.
function ReadTextFromRect($page, $pos, $reader)
{
    $reader->Begin($page);
    try {
        return RectTextSearch($reader, $pos);
    } finally {
        $reader->End();
    }
}
99
// Echoes a style="..." attribute for the given text style: font family, font
// size, optional serif marker and the colour as six hex digits built from
// elements 0..2 of GetColor().
// NOTE(review): the marker text is " sans-serif; " when IsSerif() is true;
// kept exactly as in the original output - confirm whether that is intended.
function PrintStyle($style)
{
    $rgb = $style->GetColor();
    $hex = sprintf("%02X%02X%02X;", $rgb[0], $rgb[1], $rgb[2]);

    $family = $style->GetFontName();
    $size = $style->GetFontSize();
    $serif = $style->IsSerif() ? " sans-serif; " : " ";

    echo ' style="font-family:', $family, '; ',
         'font-size:', $size, ';',
         $serif,
         'color:#', $hex, '"';
}
109
// Returns true when two text styles have the same font name, font size,
// serif flag and colour, false otherwise.
//
// The font-size and serif checks previously compared $style1 with itself and
// were therefore always true; both now compare $style1 against $style2.
function IsStyleEqual($style1, $style2)
{
    return $style1->GetFontName() == $style2->GetFontName()
        && $style1->GetFontSize() == $style2->GetFontSize()
        && (bool)$style1->IsSerif() === (bool)$style2->IsSerif()
        && $style1->GetColor() == $style2->GetColor();
}
120//---------------------------------------------------------------------------------------
121
// Initialise the library, open the test document and run the basic
// high-level TextExtractor examples (1-3) selected by the flags below.
PDFNet::Initialize($LicenseKey);
PDFNet::GetSystemFontList();    // Wait for fonts to be loaded if they haven't already. This is done because PHP can run into errors when shutting down if font loading is still in progress.

$example1_basic     = false;
$example2_xml       = false;
$example3_wordlist  = false;
$example4_advanced  = true;
$example5_low_level = false;

// Sample code showing how to use high-level text extraction APIs.

$doc = new PDFDoc($input_path);
$doc->InitSecurityHandler();

$page = $doc->GetPage(1);
if (!$page) {
    echo nl2br("Page not found.\n");
    // Release the document and shut the library down before leaving the
    // script; the early return previously skipped PDFNet::Terminate().
    $doc->Close();
    PDFNet::Terminate();
    return;
}

$txt = new TextExtractor();
$txt->Begin($page); // Read the page.
// Other options you may want to consider...
// txt.Begin(*itr, 0, TextExtractor::e_no_dup_remove);
// txt.Begin(*itr, 0, TextExtractor::e_remove_hidden_text);

// Example 1. Get all text on the page in a single string.
// Words will be separated with space or new line characters.
if ($example1_basic)
{
    // Get the word count. Wrapped in nl2br() like every other line of output
    // so the line break is kept when the result is viewed in a browser.
    echo nl2br("Word Count: ".$txt->GetWordCount()."\n");

    $text = $txt->GetAsText();
    echo nl2br("\n\n- GetAsText --------------------------\n".$text."\n");
    echo nl2br("-----------------------------------------------------------\n");
}

// Example 2. Get XML logical structure for the page.
if ($example2_xml)
{
    $text = $txt->GetAsXML(TextExtractor::e_words_as_elements | TextExtractor::e_output_bbox | TextExtractor::e_output_style_info);
    echo nl2br("\n\n- GetAsXML --------------------------\n".$text."\n");
    echo nl2br("-----------------------------------------------------------\n");
}

// Example 3. Extract words one by one.
if ($example3_wordlist)
{
    for ($line = $txt->GetFirstLine(); $line->IsValid(); $line = $line->GetNextLine()) {
        for ($word = $line->GetFirstWord(); $word->IsValid(); $word = $word->GetNextWord()) {
            echo nl2br($word->GetString()."\n");
        }
    }
    echo nl2br("-----------------------------------------------------------\n");
}
178
// Example 4. A more advanced text extraction example.
// The output is XML structure containing paragraphs, lines, words,
// as well as style and positioning information.
if ($example4_advanced)
{
    $cur_flow_id = -1;
    $cur_para_id = -1;

    echo nl2br("<PDFText>\n");
    // For each line on the page...
    for ($line = $txt->GetFirstLine(); $line->IsValid(); $line = $line->GetNextLine())
    {
        if ($line->GetNumWords() == 0) continue;

        // A change of flow closes the open paragraph and flow first.
        if ($cur_flow_id != $line->GetFlowID()) {
            if ($cur_flow_id != -1) {
                if ($cur_para_id != -1) {
                    $cur_para_id = -1;
                    echo nl2br("</Para>\n");
                }
                echo nl2br("</Flow>\n");
            }
            $cur_flow_id = $line->GetFlowID();
            echo nl2br("<Flow id=\"".$cur_flow_id."\">\n");
        }

        if ($cur_para_id != $line->GetParagraphID()) {
            if ($cur_para_id != -1)
                echo nl2br("</Para>\n");
            $cur_para_id = $line->GetParagraphID();
            echo nl2br("<Para id=\"".$cur_para_id."\">\n");
        }

        $bbox1 = $line->GetBBox();
        $line_style = $line->GetStyle();
        printf("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", $bbox1->x1, $bbox1->y1, $bbox1->x2, $bbox1->y2);
        PrintStyle($line_style);
        echo " cur_num=\"".$line->GetCurrentNum()."\"";
        echo nl2br(">\n");

        // For each word in the line...
        for ($word = $line->GetFirstWord(); $word->IsValid(); $word = $word->GetNextWord())
        {
            // Skip empty words BEFORE emitting anything. The check used to
            // come after the opening of the <Word ...> tag had been printed,
            // leaving an unterminated tag in the output.
            $sz = $word->GetStringLen();
            if ($sz == 0) continue;

            // Output the bounding box for the word.
            $bbox2 = $word->GetBBox();
            printf("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", $bbox2->x1, $bbox2->y1, $bbox2->x2, $bbox2->y2);
            echo " cur_num=\"".$word->GetCurrentNum()."\"";

            // If the word style is different from the parent style, output the new style.
            $s = $word->GetStyle();
            if (!$s->IsEqual($line_style)) {
                PrintStyle($s);
            }

            echo ">".$word->GetString();
            echo nl2br("</Word>\n");
        }
        echo nl2br("</Line>\n");
    }

    if ($cur_flow_id != -1) {
        if ($cur_para_id != -1) {
            $cur_para_id = -1;
            echo nl2br("</Para>\n");
        }
        echo nl2br("</Flow>\n");
    }
    echo nl2br("</PDFText>\n");
}

// Release the extractor and the document whether or not example 4 ran;
// previously they were released only inside the example-4 branch.
$txt->Destroy();
$doc->Close();
256
// Sample code showing how to use low-level text extraction APIs.
if ($example5_low_level)
{
    $doc = new PDFDoc($input_path);
    $doc->InitSecurityHandler();

    // Example 1. Extract all text content from the document
    $reader = new ElementReader();

    // Read every page
    $itr = $doc->GetPageIterator();
    while ($itr->HasNext())
    {
        $reader->Begin($itr->Current());
        DumpAllText($reader);
        $reader->End();
        $itr->Next();
    }

    // Example 2. Extract text content based on the
    // selection rectangle.
    echo nl2br("\n----------------------------------------------------");
    echo nl2br("\nExtract text based on the selection rectangle.");
    echo nl2br("\n----------------------------------------------------\n");

    $first_page = $doc->GetPage(1);

    // Field number => rectangle corner coordinates passed to Rect().
    $field_rects = array(
        1 => array(27.0, 392.0, 563.0, 534.0),
        2 => array(28.0, 551.0, 106.0, 623.0),
        3 => array(208.0, 550.0, 387.0, 621.0),
    );
    foreach ($field_rects as $field_no => $c)
    {
        $s1 = ReadTextFromRect($first_page, new Rect($c[0], $c[1], $c[2], $c[3]), $reader);
        echo nl2br("\nField ".$field_no.": ".$s1);
    }

    // ...
    $doc->Close();
    echo nl2br("Done.\n");
}
PDFNet::Terminate();
295?>
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
14
def printStyle(style):
    """Write a style="..." attribute for ``style`` to stdout, without a newline.

    The attribute holds the font family, the font size formatted with ``%g``,
    an optional serif marker and the colour as six hex digits built from
    elements 0..2 of ``style.GetColor()``.
    """
    # NOTE(review): the marker text is " sans-serif;" when IsSerif() is true;
    # kept exactly as in the original output - confirm whether intended.
    serif_part = " sans-serif;" if style.IsSerif() else ""
    color = style.GetColor()
    color_hex = "%02X%02X%02X;" % (color[0], color[1], color[2])
    size_text = "%g" % style.GetFontSize()
    pieces = (
        ' style="font-family:', style.GetFontName(),
        "; font-size:", size_text, ";",
        serif_part,
        " color:#", color_hex, '"',
    )
    sys.stdout.write("".join(pieces))
24
def dumpAllText(reader):
    """Print every text-related element the reader yields.

    Reads elements until ``reader.Next()`` returns None, printing a marker for
    text-block boundaries and new lines, the bounding box and string of each
    text run, and recursing into form XObjects.
    """
    element = reader.Next()
    while element is not None:
        # Named `kind` so the builtin `type` is not shadowed.
        kind = element.GetType()
        if kind == Element.e_text_begin:
            print("Text Block Begin")
        elif kind == Element.e_text_end:
            print("Text Block End")
        elif kind == Element.e_text:
            bbox = element.GetBBox()
            print("BBox: " + str(bbox.GetX1()) + ", " + str(bbox.GetY1()) + ", "
                  + str(bbox.GetX2()) + ", " + str(bbox.GetY2()))
            print(element.GetTextString())
        elif kind == Element.e_text_new_line:
            print("New Line")
        elif kind == Element.e_form:
            # Process form XObjects: descend, dump, then return to the parent stream.
            reader.FormBegin()
            dumpAllText(reader)
            reader.End()
        element = reader.Next()
46
# A utility method used to extract all text content from
# a given selection rectangle. The rectangle coordinates are
# expressed in PDF user/page coordinate system.
def ReadTextFromRect(page, pos, reader):
    """Return the text RectTextSearch collects for rectangle ``pos`` on ``page``.

    ``reader.End()`` runs in a ``finally`` clause so the reader is closed even
    when the search raises.
    """
    reader.Begin(page)
    try:
        return RectTextSearch(reader, pos)
    finally:
        reader.End()
55
# A helper method for ReadTextFromRect
def RectTextSearch(reader, pos):
    """Collect the text of every run that intersects ``pos``.

    Appends the string of each text run for which
    ``bbox.IntersectRect(bbox, pos)`` is true, followed by a newline, and
    searches form XObjects recursively. Returns the collected string.

    The helper no longer prints the accumulated text while inside a form
    XObject; callers print the returned value themselves, so that output was
    a duplicated debugging leftover.
    """
    found = ""
    element = reader.Next()
    while element is not None:
        # Named `kind` so the builtin `type` is not shadowed.
        kind = element.GetType()
        if kind == Element.e_text:
            bbox = element.GetBBox()
            if bbox.IntersectRect(bbox, pos):
                found += element.GetTextString()
                found += "\n"
        elif kind == Element.e_form:
            reader.FormBegin()
            found += RectTextSearch(reader, pos)
            reader.End()
        # Every other element type, including new-line markers, adds nothing.
        element = reader.Next()
    return found
77
78
def main():
    """Run the text-extraction examples selected by the flags below.

    Opens the test PDF, runs the high-level TextExtractor examples on page 1,
    then (optionally) the low-level ElementReader examples, and shuts the
    library down.
    """
    PDFNet.Initialize(LicenseKey)

    # Relative path to the folder containing test files.
    input_path = "../../TestFiles/newsletter.pdf"
    example1_basic = False
    example2_xml = False
    example3_wordlist = False
    example4_advanced = True
    example5_low_level = False

    # Sample code showing how to use high-level text extraction APIs.
    doc = PDFDoc(input_path)
    doc.InitSecurityHandler()

    page = doc.GetPage(1)
    if page is None:
        print("page no found")
        # Nothing can be extracted without a page: clean up and stop instead
        # of passing None on to the extractor.
        doc.Close()
        PDFNet.Terminate()
        return

    txt = TextExtractor()
    txt.Begin(page)  # Read the page

    # Example 1. Get all text on the page in a single string.
    # Words will be separated with space or new line characters.
    if example1_basic:
        print("Word count: " + str(txt.GetWordCount()))
        txtAsText = txt.GetAsText()
        print("- GetAsText --------------------------" + txtAsText)
        print("-----------------------------------------------------------")

    # Example 2. Get XML logical structure for the page.
    if example2_xml:
        text = txt.GetAsXML(TextExtractor.e_words_as_elements |
                            TextExtractor.e_output_bbox |
                            TextExtractor.e_output_style_info)
        print("- GetAsXML  --------------------------" + text)
        print("-----------------------------------------------------------")

    # Example 3. Extract words one by one.
    if example3_wordlist:
        line = txt.GetFirstLine()
        while line.IsValid():
            word = line.GetFirstWord()
            while word.IsValid():
                print(word.GetString())
                word = word.GetNextWord()
            line = line.GetNextLine()
        print("-----------------------------------------------------------")

    # Example 4. A more advanced text extraction example.
    # The output is XML structure containing paragraphs, lines, words,
    # as well as style and positioning information.
    if example4_advanced:
        cur_flow_id = -1
        cur_para_id = -1

        print("<PDFText>")
        # For each line on the page...
        line = txt.GetFirstLine()
        while line.IsValid():
            if line.GetNumWords() == 0:
                line = line.GetNextLine()
                continue

            # A change of flow closes the open paragraph and flow first.
            if cur_flow_id != line.GetFlowID():
                if cur_flow_id != -1:
                    if cur_para_id != -1:
                        cur_para_id = -1
                        print("</Para>")
                    print("</Flow>")
                cur_flow_id = line.GetFlowID()
                print("<Flow id=\"" + str(cur_flow_id) + "\">")

            if cur_para_id != line.GetParagraphID():
                if cur_para_id != -1:
                    print("</Para>")
                cur_para_id = line.GetParagraphID()
                print("<Para id=\"" + str(cur_para_id) + "\">")

            bbox = line.GetBBox()
            line_style = line.GetStyle()
            sys.stdout.write("<Line box=\"%.2f, %.2f, %.2f, %.2f\"" % (bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2()))
            printStyle(line_style)
            sys.stdout.write(" cur_num=\"" + str(line.GetCurrentNum()) + "\"" + ">\n")

            # For each word in the line...
            word = line.GetFirstWord()
            while word.IsValid():
                # Skip empty words BEFORE writing anything. The check used to
                # come after the opening of the <Word ...> tag had been
                # written, leaving an unterminated tag in the output.
                if word.GetStringLen() == 0:
                    word = word.GetNextWord()
                    continue

                # Output the bounding box for the word
                bbox = word.GetBBox()
                sys.stdout.write("<Word box=\"%.2f, %.2f, %.2f, %.2f\"" % (bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2()))
                sys.stdout.write(" cur_num=\"" + str(word.GetCurrentNum()) + "\"")

                # If the word style is different from the parent style, output the new style.
                s = word.GetStyle()
                if s != line_style:
                    printStyle(s)
                sys.stdout.write(">" + word.GetString() + "</Word>\n")
                word = word.GetNextWord()
            sys.stdout.write("</Line>\n")
            line = line.GetNextLine()

        if cur_flow_id != -1:
            if cur_para_id != -1:
                cur_para_id = -1
                sys.stdout.write("</Para>\n")
            sys.stdout.write("</Flow>\n")

        print("</PDFText>")

    # Release the extractor and the document whether or not example 4 ran.
    txt.Destroy()
    doc.Close()

    # Sample code showing how to use low-level text extraction APIs.
    if example5_low_level:
        doc = PDFDoc(input_path)
        doc.InitSecurityHandler()

        # Example 1. Extract all text content from the document
        reader = ElementReader()
        itr = doc.GetPageIterator()
        while itr.HasNext():
            reader.Begin(itr.Current())
            dumpAllText(reader)
            reader.End()
            itr.Next()

        # Example 2. Extract text content based on the
        # selection rectangle.
        print("----------------------------------------------------")
        print("Extract text based on the selection rectangle.")
        print("----------------------------------------------------")

        itr = doc.GetPageIterator()
        first_page = itr.Current()
        s1 = ReadTextFromRect(first_page, Rect(27, 392, 563, 534), reader)
        print("Field 1: " + s1)

        s1 = ReadTextFromRect(first_page, Rect(28, 551, 106, 623), reader)
        print("Field 2: " + s1)

        s1 = ReadTextFromRect(first_page, Rect(208, 550, 387, 621), reader)
        print("Field 3: " + s1)

        doc.Close()

        print("Done.")
    PDFNet.Terminate()
235
236if __name__ == '__main__':
237 main()
1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
# Prints a style="..." attribute for the given text style, without a newline:
# font family, font size formatted with %g, an optional serif marker and the
# colour as six hex digits built from elements 0..2 of GetColor.
# NOTE(review): the marker text is " sans-serif;" when IsSerif is true;
# kept exactly as in the original output - confirm whether that is intended.
def PrintStyle(style)
  serif_part = style.IsSerif() ? " sans-serif;" : ""
  color = style.GetColor
  color_hex = format("%02X%02X%02X;", color[0], color[1], color[2])
  size_text = format('%g', style.GetFontSize)
  print " style=\"font-family:#{style.GetFontName}; font-size:#{size_text};#{serif_part} color:##{color_hex}\""
end
22
# Prints every text-related element the reader yields: a marker for text-block
# boundaries and new lines, the bounding box and string of each text run, and
# the contents of form XObjects (visited recursively). Stops when reader.Next
# returns nil.
def DumpAllText(reader)
  loop do
    el = reader.Next
    break if el.nil?

    case el.GetType
    when Element::E_text_begin then puts "Text Block Begin"
    when Element::E_text_end then puts "Text Block End"
    when Element::E_text
      box = el.GetBBox
      corners = [box.GetX1.to_s, box.GetY1.to_s, box.GetX2.to_s, box.GetY2.to_s]
      puts "BBox: " + corners.join(", ")
      puts el.GetTextString
    when Element::E_text_new_line then puts "New Line"
    when Element::E_form
      # Descend into the form XObject, dump it, then return to the parent stream.
      reader.FormBegin
      DumpAllText(reader)
      reader.End
    end
  end
end
46
# A utility method used to extract all text content from
# a given selection rectangle. The rectangle coordinates are
# expressed in PDF user/page coordinate system.
#
# Begins reading the page with the supplied reader and returns what
# RectTextSearch collects. reader.End runs in an ensure clause so the reader
# is closed even when the search raises.
def ReadTextFromRect(page, pos, reader)
  reader.Begin(page)
  begin
    RectTextSearch(reader, pos)
  ensure
    reader.End
  end
end
56
# A helper method for ReadTextFromRect.
#
# Reads elements from +reader+ until it returns nil and collects the string of
# every text element whose bounding box intersects +pos+, appending a newline
# after each one. Form XObjects are searched recursively. Returns the
# collected text; nothing is written to stdout.
def RectTextSearch(reader, pos)
  found = ""
  until (element = reader.Next).nil?
    case element.GetType
    when Element::E_text
      bbox = element.GetBBox
      if bbox.IntersectRect(bbox, pos)
        found += element.GetTextString
        found += "\n"
      end
    when Element::E_form
      # Search inside the form, then return to the parent content stream.
      reader.FormBegin
      found += RectTextSearch(reader, pos)
      reader.End
    end
  end
  found
end
81
PDFNet.Initialize(PDFTronLicense.Key)

# Relative path to the test document.
input_path = "../../TestFiles/newsletter.pdf"

# Switches selecting which of the examples below are run.
example1_basic = false
example2_xml = false
example3_wordlist = false
example4_advanced = true
example5_low_level = false

# Sample code showing how to use high-level text extraction APIs.
doc = PDFDoc.new(input_path)
doc.InitSecurityHandler

page = doc.GetPage(1)
if page.nil?
  # Without a page there is nothing to extract: report it, release the
  # document and the library, and stop instead of passing nil to the extractor.
  puts "Page not found."
  doc.Close
  PDFNet.Terminate
  exit
end

txt = TextExtractor.new
txt.Begin(page) # Read the page
103
# Example 1. Get all text on the page in a single string.
# Words will be separated with space or new line characters.
if example1_basic
  word_total = txt.GetWordCount
  puts "Word count: #{word_total}"
  page_text = txt.GetAsText
  puts "- GetAsText --------------------------" + page_text
  puts "-----------------------------------------------------------"
end
111
# Example 2. Get XML logical structure for the page.
if example2_xml
  # Request words as elements, with bounding boxes and style information.
  xml_flags = TextExtractor::E_words_as_elements |
              TextExtractor::E_output_bbox |
              TextExtractor::E_output_style_info
  puts "- GetAsXML --------------------------" + txt.GetAsXML(xml_flags)
  puts "-----------------------------------------------------------"
end
120
121
122
# Example 3. Extract words one by one.
# Walks every line of the page and prints each word on its own output line.
if example3_wordlist
  line = txt.GetFirstLine
  while line.IsValid
    word = line.GetFirstWord
    while word.IsValid
      puts word.GetString
      word = word.GetNextWord
    end
    line = line.GetNextLine
  end
  puts "-----------------------------------------------------------"
end
137
138
# Example 4. A more advanced text extraction example.
# The output is XML structure containing paragraphs, lines, words,
# as well as style and positioning information.
if example4_advanced
  # Ids of the flow and paragraph currently open in the output; -1 means none.
  cur_flow_id = -1
  cur_para_id = -1

  puts "<PDFText>"
  # For each line on the page...
  line = txt.GetFirstLine
  while line.IsValid
    # Lines without words produce no output.
    if line.GetNumWords == 0
      line = line.GetNextLine
      next
    end

    # A new flow closes any open paragraph and flow before opening itself.
    if cur_flow_id != line.GetFlowID
      if cur_flow_id != -1
        if cur_para_id != -1
          cur_para_id = -1
          puts "</Para>"
        end
        puts "</Flow>"
      end
      cur_flow_id = line.GetFlowID
      puts "<Flow id=\"" + cur_flow_id.to_s + "\">"
    end

    # A new paragraph closes the open one, if any.
    if cur_para_id != line.GetParagraphID
      if cur_para_id != -1
        puts "</Para>"
      end
      cur_para_id = line.GetParagraphID
      puts "<Para id=\"" + cur_para_id.to_s + "\">"
    end

    bbox = line.GetBBox
    line_style = line.GetStyle
    print format("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", bbox.GetX1, bbox.GetY1, bbox.GetX2, bbox.GetY2)
    PrintStyle(line_style)
    print " cur_num=\"" + format("%d", line.GetCurrentNum) + "\"" + ">\n"

    # For each word in the line...
    word = line.GetFirstWord
    while word.IsValid
      # Skip empty words before writing anything, so that no unterminated
      # <Word ...> fragment is left in the output.
      if word.GetStringLen == 0
        word = word.GetNextWord
        next
      end

      # Output the bounding box for the word
      bbox = word.GetBBox
      print format("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", bbox.GetX1, bbox.GetY1, bbox.GetX2, bbox.GetY2)
      print " cur_num=\"" + format("%d", word.GetCurrentNum) + "\""

      # If the word style is different from the parent style, output the new style.
      # NOTE(review): whether != compares style values or object identity
      # depends on the binding -- confirm.
      s = word.GetStyle
      if s != line_style
        PrintStyle(s)
      end
      print ">" + word.GetString + "</Word>\n"
      word = word.GetNextWord
    end
    puts "</Line>"
    line = line.GetNextLine
  end

  # Close whatever is still open after the last line.
  if cur_flow_id != -1
    if cur_para_id != -1
      cur_para_id = -1
      puts "</Para>"
    end
    puts "</Flow>"
  end

  puts "</PDFText>"
end

# Release the extractor and the document whether or not Example 4 ran.
txt.Destroy
doc.Close
219
# Sample code showing how to use low-level text extraction APIs.
if example5_low_level
  doc = PDFDoc.new(input_path)
  doc.InitSecurityHandler

  # Example 1. Extract all text content from the document

  reader = ElementReader.new
  page_itr = doc.GetPageIterator
  while page_itr.HasNext
    reader.Begin(page_itr.Current)
    DumpAllText(reader)
    reader.End
    page_itr.Next
  end

  # Example 2. Extract text content based on the
  # selection rectangle.

  rule = "----------------------------------------------------"
  puts rule
  puts "Extract text based on the selection rectangle."
  puts rule

  page_itr = doc.GetPageIterator
  first_page = page_itr.Current

  # Coordinates of the three selection rectangles, in output order.
  field_rects = [
    [27, 392, 563, 534],
    [28, 551, 106, 623],
    [208, 550, 387, 621]
  ]
  field_rects.each_with_index do |coords, idx|
    field_text = ReadTextFromRect(first_page, Rect.new(*coords), reader)
    puts "Field #{idx + 1}: " + field_text
  end

  doc.Close
  puts "Done."
end
PDFNet.Terminate
1'
2' Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3'
4
5Imports System
6Imports System.Drawing
7Imports pdftron
8Imports pdftron.Common
9Imports pdftron.Filters
10Imports pdftron.SDF
11Imports PDFTRON.PDF
12
13' This sample illustrates various text extraction capabilities of PDFNet.
14
15Module TextExtractTestVB
16 Dim pdfNetLoader As PDFNetLoader
17 Sub New()
18 pdfNetLoader = pdftron.PDFNetLoader.Instance()
19 End Sub
20
21 Sub Main()
22
23 PDFNet.Initialize(PDFTronLicense.Key)
24
25 ' Relative path to the folder containing test files.
26 Dim input_path As String = "../../../../TestFiles/"
27
28 Dim example1_basic As Boolean = False
29 Dim example2_xml As Boolean = False
30 Dim example3_wordlist As Boolean = False
31 Dim example4_advanced As Boolean = True
32 Dim example5_low_level As Boolean = False
33
34 ' Sample code showing how to use high-level text extraction APIs.
35 Try
36 Using doc As PDFDoc = New PDFDoc(input_path + "newsletter.pdf")
37 doc.InitSecurityHandler()
38
39 Dim pg As Page = doc.GetPage(1)
40 If pg Is Nothing Then
41 Console.WriteLine("Page not found.")
42 Return
43 End If
44
45 Using txt As TextExtractor = New TextExtractor
46 txt.Begin(pg) ' Read the page.
47 ' Other options you may want to consider...
48 ' txt.Begin(page, Nothing, TextExtractor.ProcessingFlags.e_no_dup_remove)
49 ' txt.Begin(page, Nothing, TextExtractor.ProcessingFlags.e_remove_hidden_text)
50 ' ...
51
52 ' Example 1. Get all text on the page in a single string.
53 ' Words will be separated with space or new line characters.
54 If example1_basic Then
55 ' Get the word count.
56 Console.WriteLine("Word Count: {0}", txt.GetWordCount())
57
58 Console.WriteLine("")
59 Console.WriteLine("- GetAsText --------------------------")
60 Console.WriteLine(txt.GetAsText())
61 Console.WriteLine("-----------------------------------------------------------")
62 End If
63
64
65 ' Example 2. Get XML logical structure for the page.
66 If example2_xml Then
67 Console.WriteLine("")
68 Console.WriteLine("- GetAsXML --------------------------")
69 Console.WriteLine(txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements Or TextExtractor.XMLOutputFlags.e_output_bbox Or TextExtractor.XMLOutputFlags.e_output_style_info))
70 Console.WriteLine("-----------------------------------------------------------")
71 End If
72
73
74 If example3_wordlist Then
75 Dim word As TextExtractor.Word
76 Dim line As TextExtractor.Line = txt.GetFirstLine()
77 While line.IsValid()
78 word = line.GetFirstWord()
79 While word.IsValid()
80 Console.WriteLine(word.GetString())
81 word = word.GetNextWord()
82 End While
83 line = line.GetNextLine()
84 End While
85 Console.WriteLine("-----------------------------------------------------------")
86 End If
87
88
89 ' Example 3. A more advanced text extraction example.
90 ' The output is XML structure containing paragraphs, lines, words,
91 ' as well as style and positioning information.
92 If example4_advanced Then
93 Dim bbox As Rect
94 Dim cur_flow_id As Integer = -1
95 Dim cur_para_id As Integer = -1
96
97 Dim line As TextExtractor.Line
98 Dim word As TextExtractor.Word
99 Dim s As TextExtractor.Style
100 Dim line_style As TextExtractor.Style
101 Console.WriteLine("<PDFText>")
102 ' For each line on the page...
103 line = txt.GetFirstLine()
104
105 While line.IsValid()
106 If Not cur_flow_id = line.GetFlowID() Then
107 If Not cur_flow_id = -1 Then
108 If Not cur_para_id = -1 Then
109 cur_para_id = -1
110 Console.WriteLine("</Para>")
111 End If
112 Console.WriteLine("</Flow>")
113 End If
114 cur_flow_id = line.GetFlowID()
115 Console.WriteLine("<Flow id=""{0}"">", cur_flow_id)
116 End If
117
118 If Not cur_para_id = line.GetParagraphID() Then
119 If Not cur_para_id = -1 Then
120 Console.WriteLine("</Para>")
121 End If
122 cur_para_id = line.GetParagraphID()
123 Console.WriteLine("<Para id=""{0}"">", cur_para_id)
124 End If
125
126 bbox = line.GetBBox()
127 line_style = line.GetStyle()
128 Console.Write("<Line box=""{0}, {1}, {2}, {3}""", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"))
129 PrintStyle(line_style)
130 Console.Write(" cur_num=""" & line.GetCurrentNum() & """")
131 Console.WriteLine(">")
132
133 ' For each word in the line...
134 word = line.GetFirstWord()
135 While word.IsValid()
136 ' Output the bounding box for the word.
137 bbox = word.GetBBox()
138 Console.Write("<Word box=""{0}, {1}, {2}, {3}""", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"))
139 Console.Write(" cur_num=""" & word.GetCurrentNum() & """")
140 Dim sz As Integer = word.GetStringLen()
141 If (sz = 0) Then Continue While
142 ' If the word style is different from the parent style, output the new style.
143 s = word.GetStyle()
144 If Not s.Equals(line_style) Then
145 PrintStyle(s)
146 End If
147
148 Console.Write(">")
149 Console.Write(word.GetString())
150 Console.WriteLine("</Word>")
151 word = word.GetNextWord()
152 End While
153
154 Console.WriteLine("</Line>")
155 line = line.GetNextLine()
156 End While
157
158 If Not cur_flow_id = -1 Then
159 If Not cur_para_id = -1 Then
160 cur_para_id = -1
161 Console.WriteLine("</Para>")
162 End If
163 Console.WriteLine("</Flow>")
164 End If
165 End If
166
167 Console.WriteLine("</PDFText>")
168 End Using
169 End Using
170 Catch ex As PDFNetException
171 Console.WriteLine(ex.Message)
172 Catch ex As Exception
173 MsgBox(ex.Message)
174 End Try
175
176
177
178 ' Sample code showing how to use low-level text extraction APIs.
179 If (example5_low_level) Then
180
181 Try
182 ' Open the test file
183 Using doc As PDFDoc = New PDFDoc(input_path + "newsletter.pdf")
184 doc.InitSecurityHandler()
185
186 Using reader As ElementReader = New ElementReader
187
188 ' Example 1. Extract all text content from the document
189 Dim itr As PageIterator = doc.GetPageIterator()
190 ' While itr.HasNext()
191 reader.Begin(itr.Current())
192 DumpAllText(reader)
193 reader.End()
194 ' itr.Next()
195 ' End While
196
197 ' Example 2. Extract text based on the selection rectangle.
198 Console.WriteLine("----------------------------------------------------")
199 Console.WriteLine("Extract text based on the selection rectangle.")
200 Console.WriteLine("----------------------------------------------------")
201
202 Dim first_page As Page = doc.GetPage(1)
203 Dim field1 As String = ReadTextFromRect(first_page, New Rect(27, 392, 563, 534), reader)
204 Dim field2 As String = ReadTextFromRect(first_page, New Rect(28, 551, 106, 623), reader)
205 Dim field3 As String = ReadTextFromRect(first_page, New Rect(208, 550, 387, 621), reader)
206
207 Console.WriteLine("Field 1: {0}", field1)
208 Console.WriteLine("Field 2: {0}", field2)
209 Console.WriteLine("Field 3: {0}", field3)
210 ' ...
211
212 Console.WriteLine("Done.")
213 End Using
214 End Using
215
216 Catch ex As PDFNetException
217 Console.WriteLine(ex.Message)
218 Catch ex As Exception
219 MsgBox(ex.Message)
220 End Try
221 End If
222 PDFNet.Terminate()
223 End Sub
224
225
226 Sub PrintStyle(ByRef s As TextExtractor.Style)
227 Dim RGB As Color = s.GetColor()
228 Dim rgb_hex As String = String.Format("{0:X02}{1:X02}{2:X02};", RGB.R, RGB.G, RGB.B)
229 Dim sans_serif_str As String = ""
230 If s.IsSerif() Then
231 sans_serif_str = " sans-serif;"
232 End If
233 Console.Write(" style=""font-family:{0}; font-size:{1};{2} color:#{3}""", s.GetFontName(), s.GetFontSize(), sans_serif_str, rgb_hex)
234 End Sub
235
236 ' LowLevelTextExtractUtils ----------------------------------------
237
238 Sub DumpAllText(ByRef reader As ElementReader)
239 Dim element As Element = reader.Next()
240 While (Not IsNothing(element)) ' Read page contents
241 Dim type As Element.Type = element.GetType()
242
243 If type = element.Type.e_text_begin Then
244 Console.WriteLine()
245 Console.WriteLine("--> Text Block Begin")
246 ElseIf type = element.Type.e_text_end Then
247 Console.WriteLine()
248 Console.WriteLine("--> Text Block End")
249 ElseIf type = element.Type.e_text Then
250 Dim bbox As Rect = New Rect
251 element.GetBBox(bbox)
252 ' Console.WriteLine("\n--> BBox: {0}, {1}, {2}, {3}", bbox.x1, bbox.y1, bbox.x2, bbox.y2)
253
254 Dim txt As String = element.GetTextString()
255 Console.WriteLine(txt)
256 ElseIf type = element.Type.e_text_new_line Then
257 ' Console.WriteLine()
258 ' Console.WriteLine("--> New Line")
259 ElseIf type = element.Type.e_form Then
260 reader.FormBegin() ' Process form XObjects
261 DumpAllText(reader)
262 reader.End()
263 End If
264
265 element = reader.Next()
266 End While
267 End Sub
268
269 Private _srch_str As String
270
271 ' A helper method for ReadTextFromRect
272 Sub RectTextSearch(ByRef reader As ElementReader, ByRef pos As Rect)
273 Dim element As Element = reader.Next()
274 While (Not IsNothing(element)) ' Read page contents
275 Dim type As Element.Type = element.GetType()
276
277 If type = element.Type.e_text Then
278 Dim bbox As Rect = New Rect
279 element.GetBBox(bbox)
280
281 If (bbox.IntersectRect(bbox, pos)) Then
282 Dim txt As String = element.GetTextString()
283 _srch_str = _srch_str + txt
284 End If
285 ElseIf type = element.Type.e_text_new_line Then
286 ElseIf type = element.Type.e_form Then
287 reader.FormBegin() ' Process form XObjects
288 RectTextSearch(reader, pos)
289 reader.End()
290 End If
291
292 element = reader.Next()
293 End While
294 End Sub
295
296
297 ' A utility method used to extract all text content from
298 ' a given selection rectangle. The rectangle coordinates are
299 ' expressed in PDF user/page coordinate system.
300 Function ReadTextFromRect(ByRef page As Page, ByRef pos As Rect, ByRef reader As ElementReader) As String
301 _srch_str = ""
302 reader.Begin(page)
303 RectTextSearch(reader, pos)
304 reader.End()
305 Return _srch_str
306 End Function
307
308End Module
Did you find this helpful?
Trial setup questions?
Ask experts on Discord
Need other help?
Contact Support
Pricing or product questions?
Contact Sales