Extract Text, Read, Parse PDF - TextExtract - Python Sample Code

Sample code for using Apryse SDK to read a PDF (parse and extract text), provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby, Go and VB. If you'd like to search text on PDF pages, see our code sample for text search.

Learn more about our full PDF Data Extraction SDK Capabilities.

To start your free trial, get started with the Server SDK.

1//
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3//
4
5using System;
6using System.Drawing;
7using pdftron;
8using pdftron.Common;
9using pdftron.Filters;
10using pdftron.SDF;
11using pdftron.PDF;
12
13
14namespace TextExtractTestCS
15{
16	// This sample illustrates various text extraction capabilities of PDFNet.
17
18	class Class1
19	{		
20		private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();
21		static Class1() {}
22		
23		static void Main(string[] args)
24		{
25			PDFNet.Initialize(PDFTronLicense.Key);
26
27			// Relative path to the folder containing test files.
28			string input_path =  "../../../../TestFiles/";
29
30			bool example1_basic     = false;
31			bool example2_xml       = false;
32			bool example3_wordlist  = false;
33			bool example4_advanced  = true;
34			bool example5_low_level = false;
35
36			// Sample code showing how to use high-level text extraction APIs.
37			try	
38			{
39				using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
40				{
41					doc.InitSecurityHandler();
42
43					Page page = doc.GetPage(1);
44					if (page == null) {
45						Console.WriteLine("Page not found.");
46						return;
47					}
48
49					using (TextExtractor txt = new TextExtractor())
50					{
51						txt.Begin(page);  // Read the page.
52						// Other options you may want to consider...
53						// txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
54						// txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
55						// ...
56
57						// Example 1. Get all text on the page in a single string.
58						// Words will be separated with space or new line characters.
59						if (example1_basic) 
60						{
61							// Get the word count.
62							Console.WriteLine("Word Count: {0}", txt.GetWordCount());
63						
64							Console.WriteLine("\n\n- GetAsText --------------------------\n{0}", txt.GetAsText());
65							Console.WriteLine("-----------------------------------------------------------");
66						}
67
68						// Example 2. Get XML logical structure for the page.
69						if (example2_xml) 
70						{
71							String text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info);
72							Console.WriteLine("\n\n- GetAsXML  --------------------------\n{0}", text);
73							Console.WriteLine("-----------------------------------------------------------");
74						}
75
76						// Example 3. Extract words one by one.
77						if (example3_wordlist) 
78						{
79							TextExtractor.Word word;
80							for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line=line.GetNextLine())	
81							{
82								for (word=line.GetFirstWord(); word.IsValid(); word=word.GetNextWord()) 
83								{
84									Console.WriteLine(word.GetString());
85								}
86							}
87							Console.WriteLine("-----------------------------------------------------------");
88						}
89
90						// Example 3. A more advanced text extraction example. 
91						// The output is XML structure containing paragraphs, lines, words, 
92						// as well as style and positioning information.
93						if (example4_advanced) 
94						{
95							Rect bbox;
96							int cur_flow_id=-1, cur_para_id=-1;
97
98							TextExtractor.Line line;
99							TextExtractor.Word word;
100							TextExtractor.Style s, line_style;
101
102							Console.WriteLine("<PDFText>");
103							// For each line on the page...
104							for (line=txt.GetFirstLine(); line.IsValid(); line=line.GetNextLine())
105							{
106								if (line.GetNumWords() == 0)
107								{
108									continue;
109								}
110
111								if (cur_flow_id != line.GetFlowID()) {
112									if (cur_flow_id != -1) {
113										if (cur_para_id != -1) {
114											cur_para_id = -1;
115											Console.WriteLine("</Para>");
116										}
117										Console.WriteLine("</Flow>");
118									}
119									cur_flow_id = line.GetFlowID();
120									Console.WriteLine("<Flow id=\"{0}\">", cur_flow_id);
121								}
122
123								if (cur_para_id != line.GetParagraphID()) {
124									if (cur_para_id != -1)
125										Console.WriteLine("</Para>");
126									cur_para_id = line.GetParagraphID();
127									Console.WriteLine("<Para id=\"{0}\">", cur_para_id);
128								}	
129
130								bbox = line.GetBBox();
131								line_style = line.GetStyle();
132								Console.Write("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
133								PrintStyle(line_style);
134								Console.Write(" cur_num=\"" + line.GetCurrentNum() + "\"" + ">\n");
135
136								// For each word in the line...
137								for (word=line.GetFirstWord(); word.IsValid(); word=word.GetNextWord())
138								{
139									// Output the bounding box for the word.
140									bbox = word.GetBBox();
141									Console.Write("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
142									Console.Write(" cur_num=\"" + word.GetCurrentNum() + "\"");
143									int sz = word.GetStringLen();
144									if (sz == 0) continue;
145
146									// If the word style is different from the parent style, output the new style.
147									s = word.GetStyle();
148									if (s != line_style) {
149										PrintStyle(s);
150									}
151
152									Console.Write(">{0}", word.GetString());
153									Console.WriteLine("</Word>");
154								}
155								Console.WriteLine("</Line>");
156							}
157
158							if (cur_flow_id != -1) {
159								if (cur_para_id != -1) {
160									cur_para_id = -1;
161									Console.WriteLine("</Para>");
162								}
163								Console.WriteLine("</Flow>");
164							}
165						}
166
167					}
168					Console.WriteLine("</PDFText>");
169				}
170			}
171			catch (PDFNetException e)
172			{
173				Console.WriteLine(e.Message);
174			}
175
176			// Sample code showing how to use low-level text extraction APIs.
177			if (example5_low_level)
178			{
179				try	
180				{
181					LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
182					using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
183					{
184						doc.InitSecurityHandler();
185
186						// Example 1. Extract all text content from the document
187						using (ElementReader reader = new ElementReader())
188						{
189							PageIterator itr = doc.GetPageIterator();
190							//for (; itr.HasNext(); itr.Next()) //  Read every page
191							{				
192								reader.Begin(itr.Current());
193								LowLevelTextExtractUtils.DumpAllText(reader);
194								reader.End();
195							}
196
197							// Example 2. Extract text based on the selection rectangle.
198							Console.WriteLine("----------------------------------------------------");
199							Console.WriteLine("Extract text based on the selection rectangle.");
200							Console.WriteLine("----------------------------------------------------");
201
202							Page first_page = doc.GetPage(1);
203							string field1 = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
204							string field2 = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
205							string field3 = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);
206
207							Console.WriteLine("Field 1: {0}", field1);
208							Console.WriteLine("Field 2: {0}", field2);
209							Console.WriteLine("Field 3: {0}", field3);
210							// ... 
211
212							Console.WriteLine("Done.");
213						}
214					}
215				}
216				catch (PDFNetException e)
217				{
218					Console.WriteLine(e.Message);
219				}	
220			}
221			PDFNet.Terminate();
222		}
223
224		static void PrintStyle(TextExtractor.Style s) {
225			Color rgb = s.GetColor();
226			String rgb_hex = String.Format("{0:X02}{1:X02}{2:X02};", rgb.R, rgb.G, rgb.B);
227			Console.Write(" style=\"font-family:{0}; font-size:{1};{2} color:#{3}\"", s.GetFontName(), s.GetFontSize(), (s.IsSerif() ? " sans-serif;" : ""), rgb_hex); 
228		}
229	}
230
231	class LowLevelTextExtractUtils
232	{
233		// A utility method used to dump all text content in the 
234		// console window.
235		public static void DumpAllText(ElementReader reader) 
236		{
237			Element element; 
238			while ((element = reader.Next()) != null)
239			{
240				switch (element.GetType()) 
241				{
242					case Element.Type.e_text_begin:
243						Console.WriteLine("\n--> Text Block Begin");
244						break;
245					case Element.Type.e_text_end:
246						Console.WriteLine("\n--> Text Block End");
247						break;
248					case Element.Type.e_text:
249					{
250						Rect bbox = new Rect();
251						element.GetBBox(bbox);
252						// Console.WriteLine("\n--> BBox: {0}, {1}, {2}, {3}", bbox.x1, bbox.y1, bbox.x2, bbox.y2);
253
254						String txt = element.GetTextString();
255						Console.Write(txt);
256						Console.WriteLine("");
257						break;
258					}
259					case Element.Type.e_text_new_line:
260					{
261						// Console.WriteLine("\n--> New Line");
262						break;
263					}
264					case Element.Type.e_form: // Process form XObjects
265					{
266						reader.FormBegin(); 
267						DumpAllText(reader);
268						reader.End(); 
269						break; 
270					}
271				}
272			}
273		}
274
275
276		private string _srch_str;
277
278		// A helper method for ReadTextFromRect
279		void RectTextSearch(ElementReader reader, Rect pos) 
280		{			
281			Element element; 
282			while ((element = reader.Next()) != null)
283			{
284				switch (element.GetType()) 
285				{
286					case Element.Type.e_text:
287					{
288						Rect bbox = new Rect();
289						element.GetBBox(bbox);
290						if(bbox.IntersectRect(bbox, pos))
291						{
292							_srch_str += element.GetTextString();
293							_srch_str += "\n"; // add a new line?
294						}
295						break;
296					}
297					case Element.Type.e_text_new_line:
298					{
299						break;
300					}
301					case Element.Type.e_form: // Process form XObjects
302					{
303						reader.FormBegin(); 
304						RectTextSearch(reader, pos);
305						reader.End(); 
306						break; 
307					}
308				}
309			}
310		}
311
312		// A utility method used to extract all text content from
313		// a given selection rectangle. The rectangle coordinates are
314		// expressed in PDF user/page coordinate system.
315		public string ReadTextFromRect(Page page, Rect pos, ElementReader reader)
316		{
317			_srch_str = "";
318			reader.Begin(page);
319			RectTextSearch(reader, pos);
320			reader.End();
321			return _srch_str;
322		}
323	}
324}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6#include <PDF/PDFNet.h>
7#include <PDF/PDFDoc.h>
8#include <PDF/ElementReader.h>
9#include <PDF/Element.h>
10#include <PDF/Font.h>
11#include <Filters/FilterReader.h>
12#include <PDF/Image/Image2RGB.h>
13#include <PDF/TextExtractor.h>
14
15// This sample illustrates the basic text extraction capabilities of PDFNet.
16
17#include <iostream>
18#include "../../LicenseKey/CPP/LicenseKey.h"
19
20using namespace std;
21
22using namespace pdftron;
23using namespace PDF;
24using namespace SDF;
25using namespace Common;
26using namespace Filters; 
27
28// A utility method used to dump all text content in the console window.
29void DumpAllText(ElementReader& reader) 
30{
31	Element element; 
32	while ((element = reader.Next()) != 0)
33	{
34		switch (element.GetType()) 
35		{
36		case Element::e_text_begin: 
37			cout << "\n--> Text Block Begin\n";
38			break;
39		case Element::e_text_end:
40			cout << "\n--> Text Block End\n";
41			break;
42		case Element::e_text:
43			{
44				Rect bbox;
45				element.GetBBox(bbox);
46				cout << "\n--> BBox: " << bbox.x1 << ", " 
47									   << bbox.y1 << ", " 
48									   << bbox.x2 << ", " 
49									   << bbox.y2 << "\n";
50
51				UString arr = element.GetTextString();
52				cout << arr << "\n";
53			}
54			break;
55		case Element::e_text_new_line:
56			cout << "\n--> New Line\n";
57			break;
58		case Element::e_form:				// Process form XObjects
59			reader.FormBegin(); 
60			DumpAllText(reader);
61			reader.End(); 
62			break; 
63		}
64	}
65}
66
67// A helper method for ReadTextFromRect
68void RectTextSearch(ElementReader& reader, const Rect& pos, UString& srch_str) 
69{			
70	Element element; 
71	while (element = reader.Next())
72	{
73		switch (element.GetType()) 
74		{
75		case Element::e_text:
76			{
77				Rect bbox;
78				element.GetBBox(bbox);
79				if(bbox.IntersectRect(bbox, pos)) 
80				{
81					UString arr = element.GetTextString();
82					srch_str += arr;
83					srch_str += "\n"; // add a new line?
84				}
85				break;
86			}
87		case Element::e_text_new_line:
88			{
89				break;
90			}
91		case Element::e_form: // Process form XObjects
92			{
93				reader.FormBegin(); 
94				RectTextSearch(reader, pos, srch_str);
95				reader.End(); 
96				break; 
97			}
98		}
99	}
100}
101
102// A utility method used to extract all text content from
103// a given selection rectangle. The rectangle coordinates are
104// expressed in PDF user/page coordinate system.
105UString ReadTextFromRect(Page& page, const Rect& pos, ElementReader& reader)
106{
107	UString srch_str;
108	reader.Begin(page);
109	RectTextSearch(reader, pos, srch_str);
110	reader.End();
111	return srch_str;
112}
113
114
115void PrintStyle(TextExtractor::Style& s)
116{
117	UInt8 rgb[3];
118	char rgb_hex[24];
119
120	s.GetColor(rgb);
121	sprintf(rgb_hex, "%02X%02X%02X;", rgb[0], rgb[1], rgb[2]);
122	cout << " style=\"font-family:" << s.GetFontName() << "; "	<< "font-size:" << s.GetFontSize() << ";" 
123		 << (s.IsSerif() ? " sans-serif; " : " ") << "color:#" << rgb_hex << "\"";
124}
125
126int main(int argc, char *argv[])
127{
128	int ret = 0;
129	PDFNet::Initialize(LicenseKey);
130	// Relative path to the folder containing test files.
131	string input_path =  "../../TestFiles/newsletter.pdf";
132
133
134
135	
136	const char* filein = argc>1 ? argv[1] : input_path.c_str();
137
138	bool example1_basic = false;
139	bool example2_xml = false;
140	bool example3_wordlist = false;
141	bool example4_advanced  = true;
142	bool example5_low_level = false;
143
144	// Sample code showing how to use high-level text extraction APIs.
145	try
146	{
147		PDFDoc doc(filein);
148		doc.InitSecurityHandler();
149
150		Page page = doc.GetPage(1);
151		if (!page){
152			cout << "Page not found." << endl;
153			return 1;
154		}
155
156		TextExtractor txt;
157		txt.Begin(page); // Read the page.
158		// Other options you may want to consider...
159		// txt.Begin(*itr, 0, TextExtractor::e_no_dup_remove);
160		// txt.Begin(*itr, 0, TextExtractor::e_remove_hidden_text);
161
162
163		// Example 1. Get all text on the page in a single string.
164		// Words will be separated with space or new line characters.
165		if (example1_basic) 
166		{
167			// Get the word count.
168			cout << "Word Count: " << txt.GetWordCount() << endl;
169
170			UString text;
171			txt.GetAsText(text);
172			cout << "\n\n- GetAsText --------------------------\n" << text << endl;
173			cout << "-----------------------------------------------------------" << endl;
174		}
175
176		// Example 2. Get XML logical structure for the page.
177		if (example2_xml) 
178		{
179			UString text;
180			txt.GetAsXML(text, TextExtractor::e_words_as_elements | TextExtractor::e_output_bbox | TextExtractor::e_output_style_info);
181			cout << "\n\n- GetAsXML  --------------------------\n" << text << endl;
182			cout << "-----------------------------------------------------------" << endl;
183		}
184
185		// Example 3. Extract words one by one.
186		if (example3_wordlist) 
187		{
188			UString text;
189			TextExtractor::Line line = txt.GetFirstLine();
190			TextExtractor::Word word;
191			for (; line.IsValid(); line=line.GetNextLine())	{
192				for (word=line.GetFirstWord(); word.IsValid(); word=word.GetNextWord()) {
193					text.Assign(word.GetString(), word.GetStringLen());
194					cout << text << '\n';
195				}
196			}
197			cout << "-----------------------------------------------------------" << endl;
198		}
199
200		// Example 4. A more advanced text extraction example. 
201		// The output is XML structure containing paragraphs, lines, words, 
202		// as well as style and positioning information.
203		if (example4_advanced) 
204		{
205			const double *b;
206			double q[8];
207			int cur_flow_id=-1, cur_para_id=-1;
208
209			UString uni_str;
210			TextExtractor::Line line;
211			TextExtractor::Word word;
212			TextExtractor::Style s, line_style;
213
214			cout << "<PDFText>\n";
215
216			// For each line on the page...
217			for (line=txt.GetFirstLine(); line.IsValid(); line=line.GetNextLine())
218			{
219				if ( line.GetNumWords() == 0 ) {
220					continue;
221				}
222
223				if (cur_flow_id != line.GetFlowID()) {
224					if (cur_flow_id != -1) {
225						if (cur_para_id != -1) {
226							cur_para_id = -1;
227							cout << "</Para>\n";
228						}
229						cout << "</Flow>\n";
230					}
231					cur_flow_id = line.GetFlowID();
232					cout << "<Flow id=\""<< cur_flow_id << "\">\n";
233				}
234
235				if (cur_para_id != line.GetParagraphID()) {
236					if (cur_para_id != -1)
237						cout << "</Para>\n";
238					cur_para_id = line.GetParagraphID();
239					cout << "<Para id=\""<< cur_para_id << "\">\n";
240				}	
241				
242				b = line.GetBBox();
243				line_style = line.GetStyle();
244				printf("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", b[0], b[1], b[2], b[3]);
245				PrintStyle(line_style);
246				cout << " cur_num=\"" << line.GetCurrentNum() << "\"";
247				cout << ">\n";
248
249				// For each word in the line...
250				for (word=line.GetFirstWord(); word.IsValid(); word=word.GetNextWord())
251				{
252					// Output the bounding box for the word.
253					word.GetBBox(q);
254					printf("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", q[0], q[1], q[2], q[3]);
255					cout << " cur_num=\"" << word.GetCurrentNum() << "\"";
256					int sz = word.GetStringLen();
257					if (sz == 0) continue;
258
259					// If the word style is different from the parent style, output the new style.
260					s = word.GetStyle();
261					if (s != line_style) {
262						PrintStyle(s);
263					}
264
265					uni_str.Assign(word.GetString(), sz);
266					cout << ">" << uni_str;
267					cout << "</Word>\n";
268				}
269				cout << "</Line>\n";
270			}
271
272			if (cur_flow_id != -1) {
273				if (cur_para_id != -1) {
274					cur_para_id = -1;
275					cout << "</Para>\n";
276				}
277				cout << "</Flow>\n";
278			}
279			cout << "</PDFText>\n";
280		}
281	}
282	catch(Exception& e)
283	{
284		cout << e << endl;
285		ret = 1;
286	}
287	catch(...)
288	{
289		cout << "Unknown Exception" << endl;
290		ret = 1;
291	}
292
293
294	if(example5_low_level)
295	{
296		try	
297		{
298			PDFDoc doc(filein);
299			doc.InitSecurityHandler();
300
301			// Example 1. Extract all text content from the document
302
303			ElementReader reader;
304			//  Read every page
305			for (PageIterator itr=doc.GetPageIterator(); itr.HasNext(); itr.Next()) 
306			{				
307				reader.Begin(itr.Current());
308				DumpAllText(reader);
309				reader.End();
310			}
311
312			// Example 2. Extract text content based on the 
313			// selection rectangle.
314			cout << "\n----------------------------------------------------";
315			cout << "\nExtract text based on the selection rectangle.";
316			cout << "\n----------------------------------------------------\n";
317
318			Page first_page = doc.GetPageIterator().Current();
319			UString s1 = ReadTextFromRect(first_page, Rect(27, 392, 563, 534), reader);
320			cout << "\nField 1: " << s1;
321
322			s1 = ReadTextFromRect(first_page, Rect(28, 551, 106, 623), reader);
323			cout << "\nField 2: " << s1;
324
325			s1 = ReadTextFromRect(first_page, Rect(208, 550, 387, 621), reader);
326			cout << "\nField 3: " << s1;
327
328			// ... 
329			cout << "Done." << endl;
330		}
331		catch(Exception& e)
332		{
333			cout << e << endl;
334			ret = 1;
335		}
336		catch(...)
337		{
338			cout << "Unknown Exception" << endl;
339			ret = 1;
340		}
341	}
342	PDFNet::Terminate();
343	return ret;
344}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2021 by PDFTron Systems Inc. All Rights Reserved.
3// Consult LICENSE.txt regarding license information.
4//---------------------------------------------------------------------------------------
5
6package main
7import (
8	"fmt"
9	"strconv"
10	"os"
11	. "pdftron"
12)
13
14import  "pdftron/Samples/LicenseKey/GO"
15
16func PrintStyle (style Style){
17    sansSerifStr := ""
18    if style.IsSerif(){
19        sansSerifStr = " sans-serif;"
20	}
21    rgb := style.GetColor()
22    rgbHex := fmt.Sprintf("%02X%02X%02X;", rgb.Get(0), rgb.Get(1), rgb.Get(2))
23    fontStr := fmt.Sprintf("%g", style.GetFontSize())
24    os.Stdout.Write([]byte(" style=\"font-family:" + style.GetFontName() + "; font-size:" + fontStr + ";" + sansSerifStr + " color:#" + rgbHex + "\""))
25}
26
27func DumpAllText (reader ElementReader){
28    element := reader.Next()
29
30    for element.GetMp_elem().Swigcptr() != 0{
31        etype := element.GetType()
32        if etype == ElementE_text_begin{
33            fmt.Println("Text Block Begin")
34        }else if etype == ElementE_text_end{
35            fmt.Println("Text Block End")
36        }else if etype == ElementE_text{
37            bbox := element.GetBBox()
38            fmt.Println("BBox: " + fmt.Sprintf("%f", bbox.GetX1()) + ", " + fmt.Sprintf("%f", bbox.GetY1()) + ", " +
39						fmt.Sprintf("%f", bbox.GetX2()) + ", " + fmt.Sprintf("%f", bbox.GetY2()))
40            textString := element.GetTextString()
41            fmt.Println(textString)
42        }else if etype == ElementE_text_new_line{
43            fmt.Println("New Line")
44        }else if etype == ElementE_form{
45            reader.FormBegin()
46            DumpAllText(reader)
47            reader.End()
48		}
49        element = reader.Next()
50	}
51}
52
53// A utility method used to extract all text content from
54// a given selection rectangle. The recnagle coordinates are
55// expressed in PDF user/page coordinate system.
56func ReadTextFromRect (page Page, pos Rect, reader ElementReader) string{
57    reader.Begin(page)
58    srchStr := RectTextSearch(reader, pos)
59    reader.End()
60    return srchStr
61}
62//A helper method for ReadTextFromRect
63func RectTextSearch (reader ElementReader, pos Rect) string{
64    element := reader.Next()
65    srchStr2 := ""
66    for element.GetMp_elem().Swigcptr() != 0{
67        etype := element.GetType()
68        if etype == ElementE_text{
69            bbox := element.GetBBox()
70            if (bbox.IntersectRect(bbox, pos)){
71                arr := element.GetTextString()
72                srchStr2 += arr
73                srchStr2 += "\n"
74			}
75        }else if etype == ElementE_text_new_line{
76            //handle text new line here
77        }else if etype == ElementE_form{
78            reader.FormBegin()
79            srchStr2 += RectTextSearch(reader, pos)
80            fmt.Println(srchStr2)
81            reader.End()
82		}
83        element = reader.Next()
84	}
85    return srchStr2
86}
87
88func main(){
89    PDFNetInitialize(PDFTronLicense.Key)
90    
91    // Relative path to the folder containing test files.
92    inputPath :=  "../../TestFiles/newsletter.pdf"
93    example1Basic := false
94    example2Xml := false
95    example3Wordlist := false
96    example4Advanced := true
97    example5LowLevel := false
98   
99    // Sample code showing how to use high-level text extraction APIs.
100    doc := NewPDFDoc(inputPath)
101    doc.InitSecurityHandler()
102    
103    page := doc.GetPage(1)
104    if page == nil{
105        fmt.Println("page no found")
106    }    
107    txt := NewTextExtractor()
108    txt.Begin(page) // Read the page
109    
110    // Example 1. Get all text on the page in a single string.
111    // Words will be separated witht space or new line characters.
112    if example1Basic{
113        fmt.Println("Word count: " + strconv.Itoa(txt.GetWordCount()))
114        txtAsText := txt.GetAsText()
115        fmt.Println("- GetAsText --------------------------" + txtAsText)
116        fmt.Println("-----------------------------------------------------------")
117	}
118    // Example 2. Get XML logical structure for the page.
119    if example2Xml{
120        text := txt.GetAsXML(TextExtractorE_words_as_elements | 
121                            TextExtractorE_output_bbox | 
122                            TextExtractorE_output_style_info)       
123        fmt.Println("- GetAsXML  --------------------------" + text)
124        fmt.Println("-----------------------------------------------------------")
125    }
126    // Example 3. Extract words one by one.
127    if example3Wordlist{
128        word := NewWord()
129        line := txt.GetFirstLine()
130        for line.IsValid(){
131            word = line.GetFirstWord()
132            for word.IsValid(){
133                wordString := word.GetString()
134                fmt.Println(wordString)
135                word = word.GetNextWord()
136			}
137            line = line.GetNextLine()
138		}
139        fmt.Println("-----------------------------------------------------------")
140	}
141    // Example 4. A more advanced text extraction example. 
142    // The output is XML structure containing paragraphs, lines, words, 
143    // as well as style and positioning information.
144    if example4Advanced{
145        bbox := NewRect()
146        curFlowId := -1
147        curParaId := -1
148        
149        fmt.Println("<PDFText>")
150        // For each line on the page...
151        line := txt.GetFirstLine()
152        for line.IsValid(){
153            if line.GetNumWords() == 0{
154                line = line.GetNextLine()			
155                continue
156			}
157            word := line.GetFirstWord()
158            if curFlowId != line.GetFlowID(){
159                if curFlowId != -1{
160                    if curParaId != -1{
161                        curParaId = -1
162                        fmt.Println("</Para>")
163					}
164                    fmt.Println("</Flow>")
165				}
166                curFlowId = line.GetFlowID()
167                fmt.Println("<Flow id=\"" + strconv.Itoa(curFlowId) +"\">")
168            }        
169            if curParaId != line.GetParagraphID(){
170                if curParaId != -1{
171                    fmt.Println("</Para>")
172				}
173                curParaId= line.GetParagraphID()
174                fmt.Println("<Para id=\"" +strconv.Itoa(curParaId)+ "\">")
175            }    
176            bbox = line.GetBBox()
177            lineStyle := line.GetStyle()
178            os.Stdout.Write([]byte(fmt.Sprintf("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2())))
179            PrintStyle (lineStyle)
180            os.Stdout.Write([]byte(" cur_num=\"" + strconv.Itoa(line.GetCurrentNum()) + "\"" + ">\n"))
181            
182            // For each word in the line...
183            word = line.GetFirstWord()
184            for word.IsValid(){
185                // Output the bounding box for the word
186                bbox = word.GetBBox()
187				os.Stdout.Write([]byte(fmt.Sprintf("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2())))
188                os.Stdout.Write([]byte(" cur_num=\"" + strconv.Itoa(word.GetCurrentNum()) + "\""));
189                sz := word.GetStringLen()
190                if sz == 0{
191                    word = word.GetNextWord()				
192                    continue
193				}
194                // If the word style is different from the parent style, output the new style.
195                s := word.GetStyle()
196                if !s.IsEqual(lineStyle){
197                    PrintStyle (s)
198				}
199                wordString := word.GetString()
200                os.Stdout.Write([]byte(">" + wordString + "</Word>\n"))
201                word = word.GetNextWord()
202			}
203            os.Stdout.Write([]byte("</Line>\n"))               
204            line = line.GetNextLine()
205        }    
206        if curFlowId != -1{
207            if curParaId != -1{
208                curParaId = -1
209                os.Stdout.Write([]byte("</Para>\n"))
210			}
211            os.Stdout.Write([]byte("</Flow>\n"))
212        }
213        txt.Destroy()
214        doc.Close()            
215        fmt.Println("</PDFText>")
216    }
217    // Sample code showing how to use low-level text extraction APIs.
218    if example5LowLevel{
219        doc = NewPDFDoc(inputPath)
220        doc.InitSecurityHandler()
221
222        // Example 1. Extract all text content from the document
223        
224        reader := NewElementReader()
225        itr := doc.GetPageIterator()
226        for itr.HasNext(){
227            reader.Begin(itr.Current())
228            DumpAllText(reader)
229            reader.End()
230            itr.Next()
231        }
232		
233        // Example 2. Extract text content based on the 
234        // selection rectangle.
235        
236        fmt.Println("----------------------------------------------------")
237        fmt.Println("Extract text based on the selection rectangle.")
238        fmt.Println("----------------------------------------------------")
239        
240        itr = doc.GetPageIterator()
241        firstPage := itr.Current()
242        s1 := ReadTextFromRect(firstPage, NewRect(27.0, 392.0, 563.0, 534.0), reader)
243        fmt.Println("Field 1: " + s1)
244
245        s1 = ReadTextFromRect(firstPage, NewRect(28.0, 551.0, 106.0, 623.0), reader);
246        fmt.Println("Field 2: " + s1)
247
248        s1 = ReadTextFromRect(firstPage, NewRect(208.0, 550.0, 387.0, 621.0), reader);
249        fmt.Println("Field 3: " + s1)
250        
251        doc.Close()
252        PDFNetTerminate()
253        fmt.Println("Done.")
254	}
255}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6import java.awt.Color;
7import com.pdftron.common.PDFNetException;
8import com.pdftron.pdf.*;
9import java.text.DecimalFormat; 
10
11
12// This sample illustrates the basic text extraction capabilities of PDFNet.
13public class TextExtractTest {
14
15    public static void main(String[] args) {
16        PDFNet.initialize(PDFTronLicense.Key());
17
18        // Relative path to the folder containing test files.
19        String input_path = "../../TestFiles/";
20        // string output_path = "../../TestFiles/Output/";
21        boolean example1_basic = false;
22        boolean example2_xml = false;
23        boolean example3_wordlist = false;
24        boolean example4_advanced = true;
25        boolean example5_low_level = false;
26
27        // Sample code showing how to use high-level text extraction APIs.
28        try (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf")) {
29            doc.initSecurityHandler();
30
31            Page page = doc.getPage(1);
32            if (page == null) {
33                System.out.println("Page not found.");
34            }
35
36            TextExtractor txt = new TextExtractor();
37            txt.begin(page);  // Read the page.
38            // Other options you may want to consider...
39            // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_no_dup_remove);
40            // txt.Begin(page, 0, TextExtractor.ProcessingFlags.e_remove_hidden_text);
41            // ...
42
43            // Example 1. Get all text on the page in a single string.
44            // Words will be separated with space or new line characters.
45            if (example1_basic) {
46                // Get the word count.
47                System.out.println("Word Count: " + txt.getWordCount());
48
49                System.out.println("\n\n- GetAsText --------------------------\n" + txt.getAsText());
50                System.out.println("-----------------------------------------------------------");
51            }
52
53            // Example 2. Get XML logical structure for the page.
54            if (example2_xml) {
55                String text = txt.getAsXML(TextExtractor.e_words_as_elements | TextExtractor.e_output_bbox | TextExtractor.e_output_style_info);
56                System.out.println("\n\n- GetAsXML  --------------------------\n" + text);
57                System.out.println("-----------------------------------------------------------");
58            }
59
60            // Example 3. Extract words one by one.
61            if (example3_wordlist) {
62                TextExtractor.Word word;
63                for (TextExtractor.Line line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
64                    for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
65                        System.out.println(word.getString());
66                    }
67                }
68                System.out.println("-----------------------------------------------------------");
69            }
70
71            // Example 4. A more advanced text extraction example.
72            // The output is XML structure containing paragraphs, lines, words,
73            // as well as style and positioning information.
74            if (example4_advanced) {
75                Rect bbox;
76                int cur_flow_id = -1, cur_para_id = -1;
77
78                TextExtractor.Line line;
79                TextExtractor.Word word;
80                TextExtractor.Style s, line_style;
81
82                System.out.println("<PDFText>");
83                // For each line on the page...
84                for (line = txt.getFirstLine(); line.isValid(); line = line.getNextLine()) {
85                    if (line.getNumWords() == 0)
86                        continue;
87                    if (cur_flow_id != line.getFlowID()) {
88                        if (cur_flow_id != -1) {
89                            if (cur_para_id != -1) {
90                                cur_para_id = -1;
91                                System.out.println("</Para>");
92                            }
93                            System.out.println("</Flow>");
94                        }
95                        cur_flow_id = line.getFlowID();
96                        System.out.println("<Flow id=\"" + cur_flow_id + "\">");
97                    }
98
99                    if (cur_para_id != line.getParagraphID()) {
100                        if (cur_para_id != -1)
101                            System.out.println("</Para>");
102                        cur_para_id = line.getParagraphID();
103                        System.out.println("<Para id=\"" + cur_para_id + "\">");
104                    }
105
106                    bbox = line.getBBox();
107                    line_style = line.getStyle();
108                    System.out.print("<Line box=\"" +  String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
109                    printStyle(line_style);
110                    System.out.println(" cur_num=\"" + line.getCurrentNum() + "\">");
111                 
112
113                    // For each word in the line...
114                    for (word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
115                        // Output the bounding box for the word.
116                        bbox = word.getBBox();
117                        System.out.print("<Word box=\"" +  String.format("%.2f, %.2f, %.2f, %.2f", bbox.getX1(), bbox.getY1(), bbox.getX2(), bbox.getY2()) + "\"");
118                        System.out.print(" cur_num=\"" + word.getCurrentNum() + "\"");
119                        int sz = word.getStringLen();
120                        if (sz == 0) continue;
121
122                        // If the word style is different from the parent style, output the new style.
123                        s = word.getStyle();
124                        if (!s.equals(line_style)) {
125                            printStyle(s);
126                        }
127
128                        System.out.print(">" + word.getString());
129                        System.out.println("</Word>");
130                    }
131                    System.out.println("</Line>");
132                }
133
134                if (cur_flow_id != -1) {
135                    if (cur_para_id != -1) {
136                        cur_para_id = -1;
137                        System.out.println("</Para>");
138                    }
139                    System.out.println("</Flow>");
140                }
141            }
142            txt.destroy();
143            System.out.println("</PDFText>");
144        } catch (PDFNetException e) {
145            System.out.println(e);
146        }
147
148        // Sample code showing how to use low-level text extraction APIs.
149        if (example5_low_level) {
150            try (PDFDoc doc = new PDFDoc((input_path + "newsletter.pdf"))) {
151                doc.initSecurityHandler();
152
153                // Example 1. Extract all text content from the document
154
155                ElementReader reader = new ElementReader();
156                //  Read every page
157                for (PageIterator itr = doc.getPageIterator(); itr.hasNext(); ) {
158                    reader.begin(itr.next());
159                    DumpAllText(reader);
160                    reader.end();
161                }
162
163                // Example 2. Extract text content based on the
164                // selection rectangle.
165                System.out.print("\n----------------------------------------------------");
166                System.out.print("\nExtract text based on the selection rectangle.");
167                System.out.println("\n----------------------------------------------------");
168
169                Page first_page = doc.getPageIterator().next();
170                String s1 = ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
171                System.out.print("\nField 1: " + s1);
172
173                s1 = ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
174                System.out.print("\nField 2: " + s1);
175
176                s1 = ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);
177                System.out.print("\nField 3: " + s1);
178
179                // ...
180                System.out.println("Done.");
181            } catch (Exception e) {
182                e.printStackTrace();
183            }
184        }
185
186        PDFNet.terminate();
187    }
188
189
190    static void printStyle(TextExtractor.Style s) {
191        Color rgb = s.getColor();
192        String rgb_hex =  String.format("%02X%02X%02X;", rgb.getRed(), rgb.getGreen(), rgb.getBlue() );
193        DecimalFormat df = new DecimalFormat("#.#");
194        System.out.print(" style=\"font-family:" + s.getFontName() + "; "
195                + "font-size:" + df.format(s.getFontSize()) + ";"
196                + (s.isSerif() ? " sans-serif; " : " ")
197                + "color:#" + rgb_hex + "\"");
198    }
199
200    // A utility method used to dump all text content in the console window.
201    static void DumpAllText(ElementReader reader) throws PDFNetException {
202        Element element;
203        while ((element = reader.next()) != null) {
204            switch (element.getType()) {
205                case Element.e_text_begin:
206                    System.out.println("\n--> Text Block Begin");
207                    break;
208                case Element.e_text_end:
209                    System.out.println("\n--> Text Block End");
210                    break;
211                case Element.e_text: {
212                    Rect bbox = element.getBBox();
213                    if (bbox == null) continue;
214                    System.out.println("\n--> BBox: " + bbox.getX1() + ", "
215                            + bbox.getY1() + ", "
216                            + bbox.getX2() + ", "
217                            + bbox.getY2());
218
219                    String arr = element.getTextString();
220                    System.out.println(arr);
221                }
222                break;
223                case Element.e_text_new_line:
224                    System.out.println("\n--> New Line");
225                    break;
226                case Element.e_form:                // Process form XObjects
227                    reader.formBegin();
228                    DumpAllText(reader);
229                    reader.end();
230                    break;
231            }
232        }
233    }
234
235    // A helper method for ReadTextFromRect
236    static String RectTextSearch(ElementReader reader, Rect pos) throws PDFNetException {
237        Element element;
238        String srch_str = new String();
239        while ((element = reader.next()) != null) {
240            switch (element.getType()) {
241                case Element.e_text: {
242                    Rect bbox = element.getBBox();
243                    if (bbox == null) continue;
244                    if (bbox.intersectRect(bbox, pos)) {
245                        String arr = element.getTextString();
246                        srch_str += arr;
247                        srch_str += "\n"; // add a new line?
248                    }
249                    break;
250                }
251                case Element.e_text_new_line: {
252                    break;
253                }
254                case Element.e_form: // Process form XObjects
255                {
256                    reader.formBegin();
257                    srch_str += RectTextSearch(reader, pos);
258                    reader.end();
259                    break;
260                }
261            }
262        }
263        return srch_str;
264    }
265
266    // A utility method used to extract all text content from
267    // a given selection rectangle. The rectangle coordinates are
268    // expressed in PDF user/page coordinate system.
269    static String ReadTextFromRect(Page page, Rect pos, ElementReader reader) throws PDFNetException {
270        reader.begin(page);
271        String srch_str = RectTextSearch(reader, pos);
272        reader.end();
273        return srch_str;
274    }
275}

1//---------------------------------------------------------------------------------------
2// Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3// Consult legal.txt regarding legal and license information.
4//---------------------------------------------------------------------------------------
5
6
7const { PDFNet } = require('@pdftron/pdfnet-node');
8const PDFTronLicense = require('../LicenseKey/LicenseKey');
9
10((exports) => {
11
12  exports.runTextExtractTest = async () => {
13    // A utility method used to dump all text content in the console window.
14    const dumpAllText = async (reader) => {
15      let element;
16      let bbox;
17      let arr;
18      while ((element = await reader.next()) !== null) {
19        switch (await element.getType()) {
20          case PDFNet.Element.Type.e_text_begin:
21            console.log('\n--> Text Block Begin');
22            break;
23          case PDFNet.Element.Type.e_text_end:
24            console.log('\n--> Text Block End');
25            break;
26          case PDFNet.Element.Type.e_text:
27            bbox = await element.getBBox();
28            console.log('\n--> BBox: ' + bbox.x1.toFixed(2) + ', ' + bbox.y1.toFixed(2) + ', ' + bbox.x2.toFixed(2) + ', ' + bbox.y2.toFixed(2) + '\n');
29            arr = await element.getTextString();
30            console.log(arr);
31            break;
32          case PDFNet.Element.Type.e_text_new_line:
33            console.log('\n--> New Line');
34            break;
35          case PDFNet.Element.Type.e_form:
36            reader.formBegin();
37            await dumpAllText(reader);
38            reader.end();
39            break;
40        }
41      }
42    };
43
44    // helper method for ReadTextFromRect
45    const rectTextSearch = async (reader, pos, srchStr) => {
46      let element;
47      let arr;
48      while ((element = await reader.next()) !== null) {
49        let bbox;
50        switch (await element.getType()) {
51          case PDFNet.Element.Type.e_text:
52            bbox = await element.getBBox();
53            if (await bbox.intersectRect(bbox, pos)) {
54              arr = await element.getTextString();
55              srchStr += arr + '\n';
56            }
57            break;
58          case PDFNet.Element.Type.e_text_new_line:
59            break;
60          case PDFNet.Element.Type.e_form:
61            reader.formBegin();
62            srchStr += await rectTextSearch(reader, pos, srchStr); // possibly need srchStr = ...
63            reader.end();
64            break;
65        }
66      }
67      return srchStr;
68    };
69
70    const readTextFromRect = async (page, pos, reader) => {
71      let srchStr = '';
72      reader.beginOnPage(page); // uses default parameters.
73      srchStr += await rectTextSearch(reader, pos, srchStr);
74      reader.end();
75      return srchStr;
76    };
77
78    const twoDigitHex = function (num) {
79      const hexStr = num.toString(16).toUpperCase();
80      return ('0' + hexStr).substr(-2);
81    }
82
83    const printStyle = async (s) => {
84      const rgb = await s.getColor();
85      const rColorVal = await rgb.get(0);
86      const gColorVal = await rgb.get(1);
87      const bColorVal = await rgb.get(2);
88      const rgbHex = twoDigitHex(rColorVal) + twoDigitHex(gColorVal) + twoDigitHex(bColorVal)
89      const fontName = await s.getFontName();
90      const fontSize = await s.getFontSize();
91      const serifOutput = ((await s.isSerif()) ? ' sans-serif; ' : ' ');
92      const returnString = ' style="font-family:' + fontName + '; font-size:' + fontSize + ';' + serifOutput + 'color:#' + rgbHex + ';"';
93      return returnString;
94    };
95
96    const main = async () => {
97      // eslint-disable-next-line no-unused-vars
98      let ret = 0;
99
100      // Relative path to the folder containing test files.
101      const inputPath = '../TestFiles/';
102      const inputFilename = 'newsletter.pdf'; // addimage.pdf, newsletter.pdf
103
104      const example1Basic = false;
105      const example2XML = false;
106      const example3Wordlist = false;
107      const example4Advanced = true;
108      const example5LowLevel = false;
109
110      try {
111        await PDFNet.startDeallocateStack();
112        const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + inputFilename);
113        doc.initSecurityHandler();
114
115        const page = await doc.getPage(1);
116
117        if (page.id === '0') {
118          console.log('Page not found.');
119          return 1;
120        }
121
122        const txt = await PDFNet.TextExtractor.create();
123        txt.begin(page);
124
125        let text;
126        let line;
127        let word;
128
129        // Example 1. Get all text on the page in a single string.
130        // Words will be separated with space or new line characters.
131        if (example1Basic) {
132          const wordCount = await txt.getWordCount();
133          console.log('Word Count: ' + wordCount);
134          text = await txt.getAsText();
135          console.log('\n\n- GetAsText --------------------------');
136          console.log(text);
137          console.log('-----------------------------------------------------------');
138        }
139
140        // Example 2. Get XML logical structure for the page.
141        if (example2XML) {
142          text = await txt.getAsXML(PDFNet.TextExtractor.XMLOutputFlags.e_words_as_elements | PDFNet.TextExtractor.XMLOutputFlags.e_output_bbox | PDFNet.TextExtractor.XMLOutputFlags.e_output_style_info);
143          console.log('\n\n- GetAsXML  --------------------------\n' + text);
144          console.log('-----------------------------------------------------------');
145        }
146
147        // Example 3. Extract words one by one.
148        if (example3Wordlist) {
149          line = await txt.getFirstLine();
150          for (; (await line.isValid()); line = (await line.getNextLine())) {
151            for (word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
152              text = await word.getString();
153              console.log(text);
154            }
155          }
156          console.log('-----------------------------------------------------------');
157        }
158
159        // Example 4. A more advanced text extraction example. 
160        // The output is XML structure containing paragraphs, lines, words, 
161        // as well as style and positioning information.
162        if (example4Advanced) {
163          let b;
164          let q;
165          let curFlowID = -1;
166          let curParaID = -1;
167
168          console.log('<PDFText>');
169
170          // For each line on the page...
171          for (line = await txt.getFirstLine(); await line.isValid(); line = await line.getNextLine()) {
172            if ((await line.getNumWords()) === 0) {
173              continue;
174            }
175            if (curFlowID !== await line.getFlowID()) {
176              if (curFlowID !== -1) {
177                if (curParaID !== -1) {
178                  curParaID = -1;
179                  console.log('</Para>');
180                }
181                console.log('</Flow>');
182              }
183              curFlowID = await line.getFlowID();
184              console.log('<Flow id="' + curFlowID + '">');
185            }
186            if (curParaID !== await line.getParagraphID()) {
187              if (curParaID !== -1) {
188                console.log('</Para>');
189              }
190              curParaID = await line.getParagraphID();
191              console.log('<Para id="' + curParaID + '">');
192            }
193            b = await line.getBBox();
194            const lineStyle = await line.getStyle();
195            let outputStringLineBox = '<Line box="' + b.x1.toFixed(2) + ', ' + b.y1.toFixed(2) + ', ' + b.x2.toFixed(2) + ', ' + b.y2.toFixed(2) + '"';
196            outputStringLineBox += (await printStyle(lineStyle));
197            const currentLineNum = await line.getCurrentNum();
198            outputStringLineBox += ' cur_num="' + currentLineNum + '">';
199            console.log(outputStringLineBox);
200
201            // For each word in the line...
202            for (word = await line.getFirstWord(); await word.isValid(); word = await word.getNextWord()) {
203              // output bounding box for the word
204              q = await word.getBBox();
205              const currentNum = await word.getCurrentNum();
206              let outputStringWord = '<Word box="' + q.x1.toFixed(2) + ', ' + q.y1.toFixed(2) + ', ' + q.x2.toFixed(2) + ', ' + q.y2.toFixed(2) + '" cur_num="' + currentNum + '"';
207              const sz = await word.getStringLen();
208              if (sz === 0) {
209                continue;
210              }
211              // if the word style is different from the parent style, output the new style
212              const sty = await word.getStyle();
213              if (!(await sty.compare(lineStyle))) {
214                outputStringWord += await printStyle(sty);
215              }
216              outputStringWord += '>' + (await word.getString()) + '</Word>';
217              console.log(outputStringWord);
218            }
219            console.log('</Line>');
220          }
221          if (curFlowID !== -1) {
222            if (curParaID !== -1) {
223              curParaID = -1;
224              console.log('</Para>');
225            }
226            console.log('</Flow>');
227          }
228          console.log('</PDFText>');
229        }
230        await PDFNet.endDeallocateStack();
231      } catch (err) {
232        console.log(err);
233        console.log(err.stack);
234        ret = 1;
235      }
236
237
238      if (example5LowLevel) {
239        ret = 0;
240        try {
241          await PDFNet.startDeallocateStack();
242          const doc = await PDFNet.PDFDoc.createFromFilePath(inputPath + inputFilename);
243          doc.initSecurityHandler();
244
245          // Example 1. Extract all text content from the document
246          const reader = await PDFNet.ElementReader.create();
247          const itr = await doc.getPageIterator(1);
248
249          //  Read every page
250          for (itr; await itr.hasNext(); itr.next()) {
251            const page = await itr.current();
252            reader.beginOnPage(page);
253            await dumpAllText(reader);
254            reader.end();
255          }
256          // Example 2. Extract text content based on the
257          // selection rectangle.
258          console.log('\n----------------------------------------------------');
259          console.log('Extract text based on the selection rectangle.');
260          console.log('----------------------------------------------------');
261
262
263          const firstPage = await (await doc.getPageIterator()).current();
264          let s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(27, 392, 563, 534)), reader);
265          console.log('\nField 1: ' + s1);
266
267          s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(28, 551, 106, 623)), reader);
268          console.log('Field 2: ' + s1);
269
270          s1 = await readTextFromRect(firstPage, (await PDFNet.Rect.init(208, 550, 387, 621)), reader);
271          console.log('Field 3: ' + s1);
272
273          // ...
274          console.log('Done');
275          await PDFNet.endDeallocateStack();
276        } catch (err) {
277          console.log(err.stack);
278          ret = 1;
279        }
280      }
281    };
282    PDFNet.runWithCleanup(main, PDFTronLicense.Key).catch(function (error) { console.log('Error: ' + JSON.stringify(error)); }).then(function () { return PDFNet.shutdown(); });
283  };
284  exports.runTextExtractTest();
285})(exports);
286// eslint-disable-next-line spaced-comment
287//# sourceURL=TextExtractTest.js

1<?php
2//---------------------------------------------------------------------------------------
3// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
4// Consult LICENSE.txt regarding license information.
5//---------------------------------------------------------------------------------------
6if(file_exists("../../../PDFNetC/Lib/PDFNetPHP.php"))
7include("../../../PDFNetC/Lib/PDFNetPHP.php");
8include("../../LicenseKey/PHP/LicenseKey.php");
9
10// Relative path to the folder containing the test files.
11$input_path = getcwd()."/../../TestFiles/newsletter.pdf";
12
13//---------------------------------------------------------------------------------------
14// This sample illustrates the basic text extraction capabilities of PDFNet.
15//---------------------------------------------------------------------------------------
16
// A utility method used to dump all text content in the browser.
// Echoes every text-related element delivered by $reader and descends
// into form XObjects recursively.
function DumpAllText($reader) 
{
	while (($element = $reader->Next()) != NULL)
	{
		switch ($element->GetType()) 
		{
		case Element::e_text_begin: 
			echo nl2br("\n--> Text Block Begin\n");
			break;
		case Element::e_text_end:
			echo nl2br("\n--> Text Block End\n");
			break;
		case Element::e_text:
			{
				$bbox = $element->GetBBox();
				echo nl2br("\n--> BBox: ".$bbox->x1.", "
							.$bbox->y1.", " 
							.$bbox->x2.", " 
							.$bbox->y2."\n");

				$arr = $element->GetTextString();
				echo nl2br($arr."\n");
			}
			break;
		case Element::e_text_new_line:
			echo nl2br("\n--> New Line\n");
			break;
		case Element::e_form:				// Process form XObjects
			$reader->FormBegin(); 
			// Pass the reader variable itself; the bare word `reader` is an
			// undefined constant, not the parameter.
			DumpAllText($reader);
			$reader->End(); 
			break; 
		}
	}
}
53
// A helper method for ReadTextFromRect.
// Gathers the text of every text element for which
// $bbox->IntersectRect($bbox, $pos) is truthy, each followed by an HTML
// line break, and descends into form XObjects recursively.
function RectTextSearch($reader, $pos) 
{
	$found = "";
	for ($element = $reader->Next(); $element != null; $element = $reader->Next())
	{
		$type = $element->GetType();
		if ($type == Element::e_text)
		{
			$bbox = $element->GetBBox();
			if ($bbox->IntersectRect($bbox, $pos))
			{
				$found .= $element->GetTextString();
				$found .= nl2br("\n");
			}
		}
		elseif ($type == Element::e_form)
		{
			// Process form XObjects
			$reader->FormBegin();
			$found .= RectTextSearch($reader, $pos);
			$reader->End();
		}
		// Element::e_text_new_line and all other types contribute nothing.
	}
	return $found;
}
88
// Returns the text found inside the selection rectangle $pos on $page.
// The rectangle coordinates are expressed in the PDF user/page coordinate
// system; $reader is opened on the page for the search and closed again.
function ReadTextFromRect($page, $pos, $reader)
{
	$reader->Begin($page);
	$text_in_rect = RectTextSearch($reader, $pos);
	$reader->End();

	return $text_in_rect;
}
99
// Echoes an inline style="..." attribute (font family, font size, optional
// generic family, hex color) for the given text style.
function PrintStyle($style)
{
	$rgb = $style->GetColor();
	$color_hex = sprintf("%02X%02X%02X;", $rgb[0], $rgb[1], $rgb[2]);

	$attribute  = " style=\"font-family:".$style->GetFontName()."; ";
	$attribute .= "font-size:".$style->GetFontSize().";";
	// NOTE(review): "sans-serif" is emitted when IsSerif() is true; this looks
	// inverted but is kept as-is - confirm against the expected sample output.
	$attribute .= $style->IsSerif() ? " sans-serif; " : " ";
	$attribute .= "color:#".$color_hex."\"";

	echo $attribute;
}
109
// Returns true when both styles have the same font name, font size,
// serif flag and color; false otherwise.
function IsStyleEqual($style1, $style2)
{
	// Each property of $style1 is compared with the same property of $style2
	// (size and serif flag were previously compared with themselves).
	return $style1->GetFontName() == $style2->GetFontName()
		&& $style1->GetFontSize() == $style2->GetFontSize()
		&& (bool)$style1->IsSerif() === (bool)$style2->IsSerif()
		&& $style1->GetColor() == $style2->GetColor();
}
120//---------------------------------------------------------------------------------------
121
122	PDFNet::Initialize($LicenseKey);
123	PDFNet::GetSystemFontList();    // Wait for fonts to be loaded if they haven't already. This is done because PHP can run into errors when shutting down if font loading is still in progress.
124
125	$example1_basic     = false;
126	$example2_xml       = false;
127	$example3_wordlist  = false;
128	$example4_advanced  = true;
129	$example5_low_level = false;
130
131	// Sample code showing how to use high-level text extraction APIs.
132	
133	$doc = new PDFDoc($input_path);
134	$doc->InitSecurityHandler();
135
136	$page = $doc->GetPage(1);
137	if (!$page){
138		echo nl2br("Page not found.\n");
139		return;
140	}
141
142	$txt = new TextExtractor();
143	$txt->Begin($page); // Read the page.
144	// Other options you may want to consider...
145	// txt.Begin(*itr, 0, TextExtractor::e_no_dup_remove);
146	// txt.Begin(*itr, 0, TextExtractor::e_remove_hidden_text);
147
148	// Example 1. Get all text on the page in a single string.
149	// Words will be separated with space or new line characters.
150	if ($example1_basic) 
151	{
152		// Get the word count.
153		echo "Word Count: ".$txt->GetWordCount()."\n";
154
155		$text = $txt->GetAsText();
156		echo nl2br("\n\n- GetAsText --------------------------\n".$text."\n");
157		echo nl2br("-----------------------------------------------------------\n");
158	}
159
160	// Example 2. Get XML logical structure for the page.
161	if ($example2_xml) 
162	{
163		$text = $txt->GetAsXML(TextExtractor::e_words_as_elements | TextExtractor::e_output_bbox | TextExtractor::e_output_style_info);
164		echo nl2br("\n\n- GetAsXML  --------------------------\n".$text."\n");
165		echo nl2br("-----------------------------------------------------------\n");
166	}
167
168	// Example 3. Extract words one by one.
169	if ($example3_wordlist) 
170	{
171		for ($line = $txt->GetFirstLine(); $line->IsValid(); $line=$line->GetNextLine())	{
172			for ($word=$line->GetFirstWord(); $word->IsValid(); $word=$word->GetNextWord()) {
173				echo nl2br($word->GetString()."\n");
174			}
175		}
176		echo nl2br("-----------------------------------------------------------\n");
177	}
178
179	// Example 4. A more advanced text extraction example. 
180	// The output is XML structure containing paragraphs, lines, words, 
181	// as well as style and positioning information.
182	if ($example4_advanced) 
183	{
184		$cur_flow_id=-1;
185		$cur_para_id=-1;
186
187		echo nl2br("<PDFText>\n");
188		// For each line on the page...
189		for ($line=$txt->GetFirstLine(); $line->IsValid(); $line=$line->GetNextLine())
190		{
191			if ($line->GetNumWords() == 0) continue;
192			
193			if ($cur_flow_id != $line->GetFlowID()) {
194				if ($cur_flow_id != -1) {
195					if ($cur_para_id != -1) {
196						$cur_para_id = -1;
197						echo nl2br("</Para>\n");
198					}
199					echo nl2br("</Flow>\n");
200				}
201				$cur_flow_id = $line->GetFlowID();
202				echo nl2br("<Flow id=\"".$cur_flow_id."\">\n");
203			}
204
205			if ($cur_para_id != $line->GetParagraphID()) {
206				if ($cur_para_id != -1)
207					echo nl2br("</Para>\n");
208				$cur_para_id = $line->GetParagraphID();
209				echo nl2br("<Para id=\"".$cur_para_id."\">\n");
210			}	
211
212			$bbox1 = $line->GetBBox();
213			$line_style = $line->GetStyle();
214			printf("<Line box=\"%.2f, %.2f, %.2f, %.2f\"", $bbox1->x1, $bbox1->y1, $bbox1->x2, $bbox1->y2);
215			PrintStyle($line_style);
216			echo  " cur_num=\"".$line->GetCurrentNum()."\"";
217			echo nl2br(">\n");
218
219			// For each word in the line...
220			for ($word=$line->GetFirstWord(); $word->IsValid(); $word=$word->GetNextWord())
221			{
222				// Output the bounding box for the word.
223				$bbox2 = $word->GetBBox();
224				printf("<Word box=\"%.2f, %.2f, %.2f, %.2f\"", $bbox2->x1, $bbox2->y1, $bbox2->x2, $bbox2->y2);
225				echo " cur_num=\"" .$word->GetCurrentNum()."\"";
226				$sz = $word->GetStringLen();
227				if ($sz == 0) continue;
228
229				// If the word style is different from the parent style, output the new style.
230				$s = $word->GetStyle();
231				if(!$s->IsEqual($line_style)){
232					PrintStyle($s);
233				}
234				
235				echo ">".$word->GetString();
236				echo nl2br("</Word>\n");
237			}
238			echo nl2br("</Line>\n");
239		}
240
241		if ($cur_flow_id != -1) {
242			if ($cur_para_id != -1) {
243				$cur_para_id = -1;
244				echo nl2br("</Para>\n");
245			}
246			echo nl2br("</Flow>\n");
247
248
249		}
250		echo nl2br("</PDFText>\n");
251
252		$txt->Destroy();
253		$doc->Close();
254
255	}
256
// Low-level text extraction demo: runs only when $example5_low_level is set.
if ($example5_low_level)
{
	$doc = new PDFDoc($input_path);
	$doc->InitSecurityHandler();

	// Part 1. Dump the text elements of every page in the document.
	$reader = new ElementReader();
	$itr = $doc->GetPageIterator();
	while ($itr->HasNext())
	{
		$reader->Begin($itr->Current());
		DumpAllText($reader);
		$reader->End();
		$itr->Next();
	}

	// Part 2. Pull the text that falls inside fixed rectangles on page 1.
	$rule = "----------------------------------------------------";
	echo nl2br("\n".$rule);
	echo nl2br("\nExtract text based on the selection rectangle.");
	echo nl2br("\n".$rule."\n");

	$first_page = $doc->GetPage(1);

	// Selection rectangles as (x1, y1, x2, y2), reported as Field 1..3 in order.
	$regions = array(
		array(27.0, 392.0, 563.0, 534.0),
		array(28.0, 551.0, 106.0, 623.0),
		array(208.0, 550.0, 387.0, 621.0),
	);
	foreach ($regions as $idx => $r)
	{
		$s1 = ReadTextFromRect($first_page, new Rect($r[0], $r[1], $r[2], $r[3]), $reader);
		echo nl2br("\nField ".($idx + 1).": ".$s1);
	}

	// ...
	$doc->Close();
	echo nl2br("Done.\n");
}
294	PDFNet::Terminate();
295?>

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6import site
7site.addsitedir("../../../PDFNetC/Lib")
8import sys
9from PDFNetPython import *
10
11sys.path.append("../../LicenseKey/PYTHON")
12from LicenseKey import *
13
14 
def printStyle(style):
    """Write a ` style="..."` attribute describing `style` to stdout.

    `style` must provide IsSerif(), GetColor() (indexable, three components),
    GetFontSize() and GetFontName(). Nothing is returned and no newline is
    written, so the attribute can be appended to an already open tag.
    """
    # NOTE(review): the " sans-serif;" token is emitted when IsSerif() is true;
    # this looks mislabeled but is kept as-is -- confirm the intended output.
    serif_token = " sans-serif;" if style.IsSerif() else ""
    color = style.GetColor()
    color_hex = "%02X%02X%02X;" % (color[0], color[1], color[2])
    size_text = '%g' % style.GetFontSize()
    sys.stdout.write("".join([
        " style=\"font-family:", style.GetFontName(),
        "; font-size:", size_text, ";",
        serif_token,
        " color:#", color_hex, "\"",
    ]))
24
def dumpAllText(reader):
    """Print a trace of the text-related elements produced by `reader`.

    Elements are pulled with reader.Next() until it returns None. Text runs are
    printed with their bounding box followed by their string; text-block
    begin/end and new-line elements are printed as marker lines. A form element
    is entered with FormBegin(), dumped recursively, and left with End().
    """
    while True:
        element = reader.Next()
        if element is None:
            break

        kind = element.GetType()
        if kind == Element.e_text_begin:
            print("Text Block Begin")
        elif kind == Element.e_text_end:
            print("Text Block End")
        elif kind == Element.e_text:
            box = element.GetBBox()
            corners = (box.GetX1(), box.GetY1(), box.GetX2(), box.GetY2())
            print("BBox: " + ", ".join(str(value) for value in corners))
            print(element.GetTextString())
        elif kind == Element.e_text_new_line:
            print("New Line")
        elif kind == Element.e_form:
            # Descend into the form's own content stream, then resume here.
            reader.FormBegin()
            dumpAllText(reader)
            reader.End()
46
47# A utility method used to extract all text content from
48# a given selection rectangle. The rectangle coordinates are
49# expressed in PDF user/page coordinate system.
def ReadTextFromRect(page, pos, reader):
    """Return the text RectTextSearch() collects from `page` for rectangle `pos`.

    `reader` is started on `page` with Begin(), handed to RectTextSearch()
    together with `pos`, and closed with End() before the result is returned.
    """
    reader.Begin(page)
    collected = RectTextSearch(reader, pos)
    reader.End()
    return collected
55
56#A helper method for ReadTextFromRect
def RectTextSearch(reader, pos):
    """Collect the text of every text element whose bounding box meets `pos`.

    Elements are pulled with reader.Next() until it returns None. Each matching
    text run contributes its string followed by a newline. Form elements are
    entered with FormBegin(), searched recursively, and left with End(); their
    matches are spliced in at the position of the form.

    Returns the concatenated text as a str ("" when nothing matches). The
    function only returns the text; it does not print anything.
    """
    pieces = []
    element = reader.Next()
    while element is not None:
        kind = element.GetType()
        if kind == Element.e_text:
            bbox = element.GetBBox()
            # IntersectRect is called exactly as in the original sample: bbox
            # is passed both as receiver and first argument.
            if bbox.IntersectRect(bbox, pos):
                pieces.append(element.GetTextString())
                pieces.append("\n")
        elif kind == Element.e_form:
            reader.FormBegin()
            pieces.append(RectTextSearch(reader, pos))
            reader.End()
        # Every other element type (including e_text_new_line) is ignored.
        element = reader.Next()
    return "".join(pieces)
77            
78
def main():
    """Run the text-extraction demos against the newsletter sample PDF.

    The example* flags below select which demos run:
      1. whole-page text, 2. XML dump, 3. word-by-word listing,
      4. hand-built XML with flow/paragraph/line/word structure,
      5. low-level extraction with ElementReader.
    All output goes to stdout. PDFNet is initialized on entry and terminated
    before returning.
    """
    PDFNet.Initialize(LicenseKey)

    # Relative path to the sample PDF used by every example.
    input_path = "../../TestFiles/newsletter.pdf"
    example1_basic = False
    example2_xml = False
    example3_wordlist = False
    example4_advanced = True
    example5_low_level = False

    # Sample code showing how to use high-level text extraction APIs.
    doc = PDFDoc(input_path)
    doc.InitSecurityHandler()

    page = doc.GetPage(1)
    if page is None:
        # Without a page there is nothing to extract; release what was
        # acquired and stop instead of passing None to TextExtractor.Begin().
        print("Page not found.")
        doc.Close()
        PDFNet.Terminate()
        return

    txt = TextExtractor()
    txt.Begin(page)  # Read the page

    # Example 1. Get all text on the page in a single string.
    # Words will be separated with space or new line characters.
    if example1_basic:
        print("Word count: " + str(txt.GetWordCount()))
        txtAsText = txt.GetAsText()
        print("- GetAsText --------------------------" + txtAsText)
        print("-----------------------------------------------------------")

    # Example 2. Get XML logical structure for the page.
    if example2_xml:
        text = txt.GetAsXML(TextExtractor.e_words_as_elements |
                            TextExtractor.e_output_bbox |
                            TextExtractor.e_output_style_info)
        print("- GetAsXML  --------------------------" + text)
        print("-----------------------------------------------------------")

    # Example 3. Extract words one by one.
    if example3_wordlist:
        line = txt.GetFirstLine()
        while line.IsValid():
            word = line.GetFirstWord()
            while word.IsValid():
                print(word.GetString())
                word = word.GetNextWord()
            line = line.GetNextLine()
        print("-----------------------------------------------------------")

    # Example 4. A more advanced text extraction example.
    # The output is XML structure containing paragraphs, lines, words,
    # as well as style and positioning information.
    if example4_advanced:
        # IDs of the currently open <Flow>/<Para> elements; -1 means none open.
        cur_flow_id = -1
        cur_para_id = -1

        print("<PDFText>")
        # For each line on the page...
        line = txt.GetFirstLine()
        while line.IsValid():
            if line.GetNumWords() == 0:
                line = line.GetNextLine()
                continue

            # A new flow closes the open paragraph and flow before opening.
            if cur_flow_id != line.GetFlowID():
                if cur_flow_id != -1:
                    if cur_para_id != -1:
                        cur_para_id = -1
                        print("</Para>")
                    print("</Flow>")
                cur_flow_id = line.GetFlowID()
                print("<Flow id=\"" + str(cur_flow_id) + "\">")

            if cur_para_id != line.GetParagraphID():
                if cur_para_id != -1:
                    print("</Para>")
                cur_para_id = line.GetParagraphID()
                print("<Para id=\"" + str(cur_para_id) + "\">")

            bbox = line.GetBBox()
            line_style = line.GetStyle()
            sys.stdout.write("<Line box=\"%.2f, %.2f, %.2f, %.2f\"" % (bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2()))
            printStyle(line_style)
            sys.stdout.write(" cur_num=\"" + str(line.GetCurrentNum()) + "\"" + ">\n")

            # For each word in the line...
            word = line.GetFirstWord()
            while word.IsValid():
                # Skip empty words before writing anything, so no <Word tag
                # is left open in the output.
                if word.GetStringLen() == 0:
                    word = word.GetNextWord()
                    continue

                # Output the bounding box for the word
                bbox = word.GetBBox()
                sys.stdout.write("<Word box=\"%.2f, %.2f, %.2f, %.2f\"" % (bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2()))
                sys.stdout.write(" cur_num=\"" + str(word.GetCurrentNum()) + "\"")

                # If the word style is different from the parent style, output the new style.
                s = word.GetStyle()
                if s != line_style:
                    printStyle(s)
                sys.stdout.write(">" + word.GetString() + "</Word>\n")
                word = word.GetNextWord()
            sys.stdout.write("</Line>\n")
            line = line.GetNextLine()

        if cur_flow_id != -1:
            if cur_para_id != -1:
                cur_para_id = -1
                sys.stdout.write("</Para>\n")
            sys.stdout.write("</Flow>\n")

        print("</PDFText>")

    # Release the extractor and the document no matter which examples ran.
    txt.Destroy()
    doc.Close()

    # Sample code showing how to use low-level text extraction APIs.
    if example5_low_level:
        doc = PDFDoc(input_path)
        doc.InitSecurityHandler()

        # Example 1. Extract all text content from the document

        reader = ElementReader()
        itr = doc.GetPageIterator()
        while itr.HasNext():
            reader.Begin(itr.Current())
            dumpAllText(reader)
            reader.End()
            itr.Next()

        # Example 2. Extract text content based on the
        # selection rectangle.

        print("----------------------------------------------------")
        print("Extract text based on the selection rectangle.")
        print("----------------------------------------------------")

        itr = doc.GetPageIterator()
        first_page = itr.Current()
        s1 = ReadTextFromRect(first_page, Rect(27, 392, 563, 534), reader)
        print("Field 1: " + s1)

        s1 = ReadTextFromRect(first_page, Rect(28, 551, 106, 623), reader)
        print("Field 2: " + s1)

        s1 = ReadTextFromRect(first_page, Rect(208, 550, 387, 621), reader)
        print("Field 3: " + s1)

        doc.Close()

        print("Done.")
    PDFNet.Terminate()
235        
236if __name__ == '__main__':
237    main()

1#---------------------------------------------------------------------------------------
2# Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3# Consult LICENSE.txt regarding license information.
4#---------------------------------------------------------------------------------------
5
6require '../../../PDFNetC/Lib/PDFNetRuby'
7include PDFNetRuby
8require '../../LicenseKey/RUBY/LicenseKey'
9
10$stdout.sync = true
11
# Writes a ` style="..."` attribute describing +style+ to stdout, without a
# trailing newline. +style+ must respond to IsSerif, GetColor (indexable,
# three components), GetFontSize and GetFontName.
def PrintStyle(style)
	# NOTE(review): " sans-serif;" is emitted when IsSerif is true; this looks
	# mislabeled but is kept as-is -- confirm the intended output.
	serif_token = style.IsSerif() ? " sans-serif;" : ""
	color = style.GetColor
	color_hex = format("%02X%02X%02X;", color[0], color[1], color[2])
	size_text = format('%g', style.GetFontSize)
	print [
		" style=\"font-family:", style.GetFontName,
		"; font-size:", size_text, ";",
		serif_token,
		" color:#", color_hex, "\""
	].join
end
22
# Prints a trace of the text-related elements produced by +reader+.
# Elements are pulled with reader.Next until it returns nil. Text runs are
# printed with their bounding box and string; text-block begin/end and
# new-line elements are printed as marker lines. Form elements are entered
# with FormBegin, dumped recursively, and left with End.
def DumpAllText(reader)
	until (element = reader.Next).nil?
		kind = element.GetType
		if kind == Element::E_text_begin
			puts "Text Block Begin"
		elsif kind == Element::E_text_end
			puts "Text Block End"
		elsif kind == Element::E_text
			box = element.GetBBox
			corners = [box.GetX1, box.GetY1, box.GetX2, box.GetY2]
			puts "BBox: " + corners.map { |value| value.to_s }.join(", ")
			puts element.GetTextString
		elsif kind == Element::E_text_new_line
			puts "New Line"
		elsif kind == Element::E_form
			# Descend into the form's own content, then resume here.
			reader.FormBegin
			DumpAllText(reader)
			reader.End
		end
	end
end
46
47# A utility method used to extract all text content from
48# a given selection rectangle. The rectangle coordinates are
49# expressed in PDF user/page coordinate system.
# Returns the text RectTextSearch collects from +page+ for rectangle +pos+.
# +reader+ is started on the page with Begin, passed to RectTextSearch along
# with +pos+, and closed with End before the result is returned.
def ReadTextFromRect(page, pos, reader)
	reader.Begin(page)
	collected = RectTextSearch(reader, pos)
	reader.End
	collected
end
56
57#A helper method for ReadTextFromRect
# Collects the text of every text element whose bounding box meets +pos+.
#
# Elements are pulled with reader.Next until it returns nil. Each matching
# text run contributes its string followed by a newline. Form elements are
# entered with FormBegin, searched recursively, and left with End; their
# matches are spliced in at the position of the form.
#
# Returns the concatenated text ("" when nothing matches). The method only
# returns the text; it does not print anything.
def RectTextSearch(reader, pos)
	found = String.new
	until (element = reader.Next).nil?
		case element.GetType
		when Element::E_text
			bbox = element.GetBBox
			# IntersectRect is called exactly as in the original sample: bbox
			# is both the receiver and the first argument.
			if bbox.IntersectRect(bbox, pos)
				found << element.GetTextString << "\n"
			end
		when Element::E_form
			reader.FormBegin
			found << RectTextSearch(reader, pos)
			reader.End
		end
		# Every other element type (including E_text_new_line) is ignored.
	end
	found
end
81	
# Script body: runs the text-extraction demos selected by the example* flags
# below against the newsletter sample PDF and writes the results to stdout.
PDFNet.Initialize(PDFTronLicense.Key)

# Relative path to the sample PDF used by every example.
input_path = "../../TestFiles/newsletter.pdf"
example1_basic = false
example2_xml = false
example3_wordlist = false
example4_advanced = true
example5_low_level = false

# Sample code showing how to use high-level text extraction APIs.
doc = PDFDoc.new(input_path)
doc.InitSecurityHandler

page = doc.GetPage(1)
if page.nil?
	# Without a page there is nothing to extract; release what was acquired
	# and stop instead of passing nil to TextExtractor#Begin.
	puts "Page not found."
	doc.Close
	PDFNet.Terminate
	exit
end

txt = TextExtractor.new
txt.Begin(page) # Read the page

# Example 1. Get all text on the page in a single string.
# Words will be separated with space or new line characters.
if example1_basic
	puts "Word count: " + txt.GetWordCount.to_s
	puts "- GetAsText --------------------------" + txt.GetAsText
	puts "-----------------------------------------------------------"
end

# Example 2. Get XML logical structure for the page.
if example2_xml
	text = txt.GetAsXML(TextExtractor::E_words_as_elements |
				TextExtractor::E_output_bbox |
				TextExtractor::E_output_style_info)
	puts "- GetAsXML  --------------------------" + text
	puts "-----------------------------------------------------------"
end

# Example 3. Extract words one by one.
if example3_wordlist
	line = txt.GetFirstLine
	while line.IsValid do
		word = line.GetFirstWord
		while word.IsValid do
			puts word.GetString
			word = word.GetNextWord
		end
		line = line.GetNextLine
	end
	puts "-----------------------------------------------------------"
end

# Example 4. A more advanced text extraction example.
# The output is XML structure containing paragraphs, lines, words,
# as well as style and positioning information.
if example4_advanced
	# IDs of the currently open <Flow>/<Para> elements; -1 means none open.
	cur_flow_id = -1
	cur_para_id = -1

	puts "<PDFText>"
	# For each line on the page...
	line = txt.GetFirstLine
	while line.IsValid do
		if line.GetNumWords == 0
			line = line.GetNextLine
			next
		end

		# A new flow closes the open paragraph and flow before opening.
		if cur_flow_id != line.GetFlowID
			if cur_flow_id != -1
				if cur_para_id != -1
					cur_para_id = -1
					puts "</Para>"
				end
				puts "</Flow>"
			end
			cur_flow_id = line.GetFlowID
			puts "<Flow id=\"" + cur_flow_id.to_s + "\">"
		end

		if cur_para_id != line.GetParagraphID
			if cur_para_id != -1
				puts "</Para>"
			end
			cur_para_id = line.GetParagraphID
			puts "<Para id=\"" + cur_para_id.to_s + "\">"
		end

		bbox = line.GetBBox
		line_style = line.GetStyle
		print "<Line box=\"%.2f, %.2f, %.2f, %.2f\"" % [bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2()]
		PrintStyle(line_style)
		print " cur_num=\"%d\">\n" % line.GetCurrentNum

		# For each word in the line...
		word = line.GetFirstWord
		while word.IsValid do
			# Skip empty words before writing anything, so no <Word tag is
			# left open in the output.
			if word.GetStringLen == 0
				word = word.GetNextWord
				next
			end

			# Output the bounding box for the word
			bbox = word.GetBBox
			print "<Word box=\"%.2f, %.2f, %.2f, %.2f\"" % [bbox.GetX1(), bbox.GetY1(), bbox.GetX2(), bbox.GetY2()]
			print " cur_num=\"%d\"" % word.GetCurrentNum

			# If the word style is different from the parent style, output the new style.
			s = word.GetStyle
			if s != line_style
				PrintStyle(s)
			end
			print ">" + word.GetString + "</Word>\n"
			word = word.GetNextWord
		end
		puts "</Line>"
		line = line.GetNextLine
	end

	if cur_flow_id != -1
		if cur_para_id != -1
			cur_para_id = -1
			puts "</Para>"
		end
		puts "</Flow>"
	end

	puts "</PDFText>"
end

# Release the extractor and the document no matter which examples ran.
txt.Destroy
doc.Close

# Sample code showing how to use low-level text extraction APIs.
if example5_low_level
	doc = PDFDoc.new(input_path)
	doc.InitSecurityHandler

	# Example 1. Extract all text content from the document

	reader = ElementReader.new
	itr = doc.GetPageIterator
	while itr.HasNext do
		reader.Begin(itr.Current)
		DumpAllText(reader)
		reader.End
		itr.Next
	end

	# Example 2. Extract text content based on the
	# selection rectangle.

	puts "----------------------------------------------------"
	puts "Extract text based on the selection rectangle."
	puts "----------------------------------------------------"

	itr = doc.GetPageIterator
	first_page = itr.Current
	s1 = ReadTextFromRect(first_page, Rect.new(27, 392, 563, 534), reader)
	puts "Field 1: " + s1

	s1 = ReadTextFromRect(first_page, Rect.new(28, 551, 106, 623), reader)
	puts "Field 2: " + s1

	s1 = ReadTextFromRect(first_page, Rect.new(208, 550, 387, 621), reader)
	puts "Field 3: " + s1

	doc.Close
	puts "Done."
end
PDFNet.Terminate

1'
2' Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
3'
4
5Imports System
6Imports System.Drawing
7Imports pdftron
8Imports pdftron.Common
9Imports pdftron.Filters
10Imports pdftron.SDF
11Imports PDFTRON.PDF
12
13' This sample illustrates various text extraction capabilities of PDFNet.
14
Module TextExtractTestVB
	' Holds the loader instance obtained in the module constructor.
	Dim pdfNetLoader As PDFNetLoader
	Sub New()
		pdfNetLoader = pdftron.PDFNetLoader.Instance()
	End Sub

	' Entry point. Runs the text-extraction demos selected by the example*
	' flags against newsletter.pdf and writes the results to the console.
	Sub Main()

		PDFNet.Initialize(PDFTronLicense.Key)

		' Relative path to the folder containing test files.
		Dim input_path As String = "../../../../TestFiles/"

		Dim example1_basic As Boolean = False
		Dim example2_xml As Boolean = False
		Dim example3_wordlist As Boolean = False
		Dim example4_advanced As Boolean = True
		Dim example5_low_level As Boolean = False

		' The outer Try/Finally guarantees PDFNet.Terminate() runs even when
		' the page-not-found branch returns early.
		Try
			' Sample code showing how to use high-level text extraction APIs.
			Try
				Using doc As PDFDoc = New PDFDoc(input_path + "newsletter.pdf")
					doc.InitSecurityHandler()

					Dim pg As Page = doc.GetPage(1)
					If pg Is Nothing Then
						Console.WriteLine("Page not found.")
						Return
					End If

					Using txt As TextExtractor = New TextExtractor
						txt.Begin(pg)	 ' Read the page.
						' Other options you may want to consider...
						' txt.Begin(page, Nothing, TextExtractor.ProcessingFlags.e_no_dup_remove)
						' txt.Begin(page, Nothing, TextExtractor.ProcessingFlags.e_remove_hidden_text)
						' ...

						' Example 1. Get all text on the page in a single string.
						' Words will be separated with space or new line characters.
						If example1_basic Then
							' Get the word count.
							Console.WriteLine("Word Count: {0}", txt.GetWordCount())

							Console.WriteLine("")
							Console.WriteLine("- GetAsText --------------------------")
							Console.WriteLine(txt.GetAsText())
							Console.WriteLine("-----------------------------------------------------------")
						End If

						' Example 2. Get XML logical structure for the page.
						If example2_xml Then
							Console.WriteLine("")
							Console.WriteLine("- GetAsXML  --------------------------")
							Console.WriteLine(txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements Or TextExtractor.XMLOutputFlags.e_output_bbox Or TextExtractor.XMLOutputFlags.e_output_style_info))
							Console.WriteLine("-----------------------------------------------------------")
						End If

						' Example 3. Extract words one by one.
						If example3_wordlist Then
							Dim line As TextExtractor.Line = txt.GetFirstLine()
							While line.IsValid()
								Dim word As TextExtractor.Word = line.GetFirstWord()
								While word.IsValid()
									Console.WriteLine(word.GetString())
									word = word.GetNextWord()
								End While
								line = line.GetNextLine()
							End While
							Console.WriteLine("-----------------------------------------------------------")
						End If

						' Example 4. A more advanced text extraction example.
						' The output is XML structure containing paragraphs, lines, words,
						' as well as style and positioning information.
						If example4_advanced Then
							PrintAdvancedXml(txt)
						End If
					End Using
				End Using
			Catch ex As PDFNetException
				Console.WriteLine(ex.Message)
			Catch ex As Exception
				MsgBox(ex.Message)
			End Try

			' Sample code showing how to use low-level text extraction APIs.
			If (example5_low_level) Then

				Try
					' Open the test file
					Using doc As PDFDoc = New PDFDoc(input_path + "newsletter.pdf")
						doc.InitSecurityHandler()

						Using reader As ElementReader = New ElementReader

							' Example 1. Dump the text content of the first page.
							' Only the page the iterator starts on is read; advance the
							' iterator in a loop (HasNext/Next) to cover the whole document.
							Dim itr As PageIterator = doc.GetPageIterator()
							reader.Begin(itr.Current())
							DumpAllText(reader)
							reader.End()

							' Example 2. Extract text based on the selection rectangle.
							Console.WriteLine("----------------------------------------------------")
							Console.WriteLine("Extract text based on the selection rectangle.")
							Console.WriteLine("----------------------------------------------------")

							Dim first_page As Page = doc.GetPage(1)
							Dim field1 As String = ReadTextFromRect(first_page, New Rect(27, 392, 563, 534), reader)
							Dim field2 As String = ReadTextFromRect(first_page, New Rect(28, 551, 106, 623), reader)
							Dim field3 As String = ReadTextFromRect(first_page, New Rect(208, 550, 387, 621), reader)

							Console.WriteLine("Field 1: {0}", field1)
							Console.WriteLine("Field 2: {0}", field2)
							Console.WriteLine("Field 3: {0}", field3)
							' ...

							Console.WriteLine("Done.")
						End Using
					End Using

				Catch ex As PDFNetException
					Console.WriteLine(ex.Message)
				Catch ex As Exception
					MsgBox(ex.Message)
				End Try
			End If
		Finally
			PDFNet.Terminate()
		End Try
	End Sub


	' Writes the page held by txt as a <PDFText> element containing nested
	' <Flow>, <Para>, <Line> and <Word> elements with box, style and cur_num
	' attributes. txt must already have been started on a page with Begin().
	Private Sub PrintAdvancedXml(ByVal txt As TextExtractor)
		' IDs of the currently open <Flow>/<Para> elements; -1 means none open.
		Dim cur_flow_id As Integer = -1
		Dim cur_para_id As Integer = -1

		Console.WriteLine("<PDFText>")

		' For each line on the page...
		Dim line As TextExtractor.Line = txt.GetFirstLine()
		While line.IsValid()
			' A new flow closes the open paragraph and flow before opening.
			If cur_flow_id <> line.GetFlowID() Then
				If cur_flow_id <> -1 Then
					If cur_para_id <> -1 Then
						cur_para_id = -1
						Console.WriteLine("</Para>")
					End If
					Console.WriteLine("</Flow>")
				End If
				cur_flow_id = line.GetFlowID()
				Console.WriteLine("<Flow id=""{0}"">", cur_flow_id)
			End If

			If cur_para_id <> line.GetParagraphID() Then
				If cur_para_id <> -1 Then
					Console.WriteLine("</Para>")
				End If
				cur_para_id = line.GetParagraphID()
				Console.WriteLine("<Para id=""{0}"">", cur_para_id)
			End If

			Dim bbox As Rect = line.GetBBox()
			Dim line_style As TextExtractor.Style = line.GetStyle()
			Console.Write("<Line box=""{0}, {1}, {2}, {3}""", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"))
			PrintStyle(line_style)
			Console.Write(" cur_num=""" & line.GetCurrentNum() & """")
			Console.WriteLine(">")

			' For each word in the line...
			Dim word As TextExtractor.Word = line.GetFirstWord()
			While word.IsValid()
				' Empty words produce no output. The advance to the next word sits
				' outside the If so it always runs; a Continue While placed before
				' it would repeat the same word forever.
				If word.GetStringLen() > 0 Then
					' Output the bounding box for the word.
					bbox = word.GetBBox()
					Console.Write("<Word box=""{0}, {1}, {2}, {3}""", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"))
					Console.Write(" cur_num=""" & word.GetCurrentNum() & """")

					' If the word style is different from the parent style, output the new style.
					Dim s As TextExtractor.Style = word.GetStyle()
					If Not s.Equals(line_style) Then
						PrintStyle(s)
					End If

					Console.Write(">")
					Console.Write(word.GetString())
					Console.WriteLine("</Word>")
				End If
				word = word.GetNextWord()
			End While

			Console.WriteLine("</Line>")
			line = line.GetNextLine()
		End While

		If cur_flow_id <> -1 Then
			If cur_para_id <> -1 Then
				cur_para_id = -1
				Console.WriteLine("</Para>")
			End If
			Console.WriteLine("</Flow>")
		End If

		' Closing tag is written here so it always pairs with the opening one.
		Console.WriteLine("</PDFText>")
	End Sub


	' Writes a style="..." attribute (font name, font size, optional serif
	' token, hex colour) for s to the console, without a trailing newline.
	Sub PrintStyle(ByRef s As TextExtractor.Style)
		Dim fontColor As Color = s.GetColor()
		Dim rgb_hex As String = String.Format("{0:X02}{1:X02}{2:X02};", fontColor.R, fontColor.G, fontColor.B)
		' NOTE(review): " sans-serif;" is emitted when IsSerif() is true; this
		' looks mislabeled but is kept as-is -- confirm the intended output.
		Dim sans_serif_str As String = ""
		If s.IsSerif() Then
			sans_serif_str = " sans-serif;"
		End If
		Console.Write(" style=""font-family:{0}; font-size:{1};{2} color:#{3}""", s.GetFontName(), s.GetFontSize(), sans_serif_str, rgb_hex)
	End Sub

	' LowLevelTextExtractUtils ----------------------------------------

	' Prints the text-related elements produced by reader: markers for text
	' block begin/end and the string of every text run. Form XObjects are
	' entered with FormBegin(), dumped recursively, and left with End().
	Sub DumpAllText(ByRef reader As ElementReader)
		Dim elem As Element = reader.Next()
		While elem IsNot Nothing		 ' Read page contents
			Dim kind As Element.Type = elem.GetType()

			Select Case kind
				Case Element.Type.e_text_begin
					Console.WriteLine()
					Console.WriteLine("--> Text Block Begin")
				Case Element.Type.e_text_end
					Console.WriteLine()
					Console.WriteLine("--> Text Block End")
				Case Element.Type.e_text
					Console.WriteLine(elem.GetTextString())
				Case Element.Type.e_form
					reader.FormBegin()				' Process form XObjects
					DumpAllText(reader)
					reader.End()
			End Select

			elem = reader.Next()
		End While
	End Sub

	' Accumulates the text found by RectTextSearch; reset by ReadTextFromRect.
	Private _srch_str As String

	' A helper method for ReadTextFromRect. Appends to _srch_str the string of
	' every text element whose bounding box meets pos, descending into form
	' XObjects.
	Sub RectTextSearch(ByRef reader As ElementReader, ByRef pos As Rect)
		Dim elem As Element = reader.Next()
		While elem IsNot Nothing		 ' Read page contents
			Dim kind As Element.Type = elem.GetType()

			If kind = Element.Type.e_text Then
				Dim bbox As Rect = New Rect
				elem.GetBBox(bbox)

				' IntersectRect is called exactly as in the original sample:
				' bbox is both the receiver and the first argument.
				If (bbox.IntersectRect(bbox, pos)) Then
					_srch_str = _srch_str + elem.GetTextString()
				End If
			ElseIf kind = Element.Type.e_form Then
				reader.FormBegin()				   ' Process form XObjects
				RectTextSearch(reader, pos)
				reader.End()
			End If

			elem = reader.Next()
		End While
	End Sub


	' A utility method used to extract all text content from
	' a given selection rectangle. The rectangle coordinates are
	' expressed in PDF user/page coordinate system.
	' Returns the collected text ("" when nothing matches).
	Function ReadTextFromRect(ByRef page As Page, ByRef pos As Rect, ByRef reader As ElementReader) As String
		_srch_str = ""
		reader.Begin(page)
		RectTextSearch(reader, pos)
		reader.End()
		Return _srch_str
	End Function

End Module

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales

Product:

Extract Text, Read, Parse PDF - TextExtract - Python Sample Code