Some test text!

Discord Logo

Chat with us

PDFTron is now Apryse, learn more here.

Read elements across all PDF pages in Ruby

More languages

More languages
JavaScript
Java (Android)
C++
C#
C# (.NET Core)
Go
Java
Kotlin
Obj-C
JS (Node.js)
PHP
Python
Ruby
Swift
C# (UWP)
VB
C# (Xamarin)

Sample Ruby code that uses the PDFTron SDK to traverse the page display list with ElementReader. Learn more about our Ruby PDF Library and PDF Parsing & Content Extraction Library.

Get Started Samples Download

To run this sample, get started with a free trial of Apryse SDK.

#---------------------------------------------------------------------------------------
# Copyright (c) 2001-2022 by PDFTron Systems Inc. All Rights Reserved.
# Consult LICENSE.txt regarding license information.
#---------------------------------------------------------------------------------------

require '../../../PDFNetC/Lib/PDFNetRuby'
include PDFNetRuby
require '../../LicenseKey/RUBY/LicenseKey'

# Put standard output in sync mode so each write is flushed immediately
# instead of sitting in Ruby's internal buffer.
$stdout.sync = true

# Relative path to the folder containing the test files.
# The script prepends this to the name of the PDF it opens.
input_path = "../../TestFiles/"

# Walks the elements that +reader+ currently exposes and prints every text
# string it encounters, descending into form XObjects recursively.
#
# reader - an object responding to Next, FormBegin and End (an ElementReader
#          in this script). Next is expected to return nil once the current
#          element sequence is exhausted.
#
# Returns nil.
def ProcessElements(reader)
	# Pull elements one at a time until the reader signals the end with nil.
	until (current = reader.Next()).nil?
		case current.GetType()
		when Element::E_path
			# Path data and its points are fetched but not used any further here.
			path_data = current.GetPathData()
			path_data.GetPoints()
		when Element::E_text
			# Emit the text run on its own line.
			puts current.GetTextString()
		when Element::E_form
			# Step into the form XObject, process its elements with the same
			# reader, then step back out to the enclosing sequence.
			reader.FormBegin()
			ProcessElements(reader)
			reader.End()
		end
	end
end

	# Start the PDFNet library with the license key from LicenseKey.rb.
	PDFNet.Initialize(PDFTronLicense.Key)

	# Sample 1: print the text found on every page of the input document.
	puts "-------------------------------------------------"
	puts "Sample 1 - Extract text data from all pages in the document."
	puts "Opening the input pdf..."

	newsletter_path = input_path + "newsletter.pdf"
	newsletter = PDFDoc.new(newsletter_path)
	newsletter.InitSecurityHandler()

	# A single reader is reused for every page; Begin/End bracket each page.
	element_reader = ElementReader.new()
	page_itr = newsletter.GetPageIterator()

	# Visit the pages in iterator order, handing each one to ProcessElements.
	while page_itr.HasNext()
		current_page = page_itr.Current()
		element_reader.Begin(current_page)
		ProcessElements(element_reader)
		element_reader.End()
		page_itr.Next()
	end

	# Close the document explicitly so its memory is released sooner,
	# then shut the library down.
	newsletter.Close()
	PDFNet.Terminate
	puts "Done."