Some test text!

Discord Logo

Chat with us

PDFTron is now Apryse, learn more here.

PDF logical structure reader in Ruby

More languages

More languages
JavaScript
Java (Android)
C++
C#
C# (.NET Core)
Go
Java
Kotlin
Obj-C
JS (Node.js)
PHP
Python
Ruby
Swift
VB
C# (Xamarin)

Sample Ruby code that uses the PDFTron SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s, which are leaf nodes of the structure tree. Learn more about our Ruby PDF Library and PDF Parsing & Content Extraction Library.

Get Started Samples Download

To run this sample, get started with a free trial of Apryse SDK.

#---------------------------------------------------------------------------------------
# Copyright (c) 2001-2022 by PDFTron Systems Inc. All Rights Reserved.
# Consult LICENSE.txt regarding license information.
#---------------------------------------------------------------------------------------

require '../../../PDFNetC/Lib/PDFNetRuby'
include PDFNetRuby
require '../../LicenseKey/RUBY/LicenseKey'

$stdout.sync = true

#---------------------------------------------------------------------------------------
# This sample explores the structure and content of a tagged PDF document and dumps 
# the structure information to the console window.
#
# In tagged PDF documents StructTree acts as a central repository for information 
# related to a PDF document's logical structure. The tree consists of StructElement-s
# and ContentItem-s which are leaf nodes of the structure tree.
#
# The sample can be extended to access and extract the marked-content elements such 
# as text and images.
#---------------------------------------------------------------------------------------

# Starts a new console line and indents it by two spaces per level.
#
# indent - number of indentation levels to emit after the line break
#          (callers in this file pass non-negative Integers).
def PrintIndent(indent)
	print "\n"
	indent.times { print "  " }
end
		
# Recursively prints a structure element and everything below it.
#
# element - the structure element to dump; an element whose IsValid is false
#           produces no output.
# indent  - indentation level for the element's own line; its kids are
#           printed one level deeper.
def ProcessStructElement(element, indent)
	return unless element.IsValid

	# Header line: the element type, followed by the title when there is one.
	PrintIndent(indent)
	print "Type: " + element.GetType
	print ". Title:" + element.GetTitle if element.HasTitle

	kid_indent = indent + 1
	element.GetNumKids.times do |idx|
		unless element.IsContentItem(idx)
			# The kid is another structure element: descend into it.
			ProcessStructElement(element.GetAsStructElem(idx), kid_indent)
			next
		end

		# The kid is a leaf node (a ContentItem).
		item = element.GetAsContentItem(idx)
		item_type = item.GetType
		owner_page = item.GetPage

		PrintIndent(kid_indent)
		print "Content Item. Part of page #" + owner_page.GetIndex.to_s
		PrintIndent(kid_indent)
		case item_type
		when ContentItem::E_MCID, ContentItem::E_MCR
			# Both marked-content kinds are reported through their MCID.
			print "MCID: " + item.GetMCID.to_s
		when ContentItem::E_OBJR
			print "OBJR "
			target = item.GetRefObj
			print "- Referenced Object#: " + target.GetObjNum.to_s unless target.nil?
		end
	end
end

# Used in code snippet 3.
# Gathers the text of one page, grouped by structure MCID.
#
# reader - an element reader already positioned on a page; it is consumed
#          until its Next method returns nil.
#
# Returns a Hash that maps each non-negative MCID to the concatenation of the
# text strings of all text elements carrying that MCID, in reading order.
def ProcessElements2(reader)
	text_by_mcid = {}
	loop do
		current = reader.Next
		break if current.nil?

		# In this sample we process only text, but the code can be extended
		# to handle paths, images, or other Element type.
		mcid = current.GetStructMCID
		next unless mcid >= 0 && current.GetType == Element::E_text

		chunk = current.GetTextString
		text_by_mcid[mcid] =
			if text_by_mcid.key?(mcid)
				# Append to the text already collected for this MCID.
				text_by_mcid[mcid].to_s + chunk
			else
				chunk
			end
	end
	text_by_mcid
end

# Used in code snippet 2.
# Prints every path, text and form XObject element delivered by the reader,
# followed by information about the structure element it belongs to (if any).
#
# reader - an element reader already positioned on a page; it is consumed
#          until its Next method returns nil.
#
# Elements of any other type are skipped without output.
def ProcessElements(reader)
	element = reader.Next
	until element.nil?	# Read page contents
		# In this sample we process only paths, text and form XObjects, but
		# the code can be extended to handle any element type.
		type = element.GetType
		if type == Element::E_path ||
			type == Element::E_text ||
			type == Element::E_form
			case type
			when Element::E_path	# Process path ...
				print "\nPATH: "
			when Element::E_text	# Process text ...
				print "\nTEXT: " + element.GetTextString + "\n"
			when Element::E_form	# Process form XObjects
				print "\nFORM XObject: "
			end

			# Check if the element is associated with any structural element.
			# Content items are leaf nodes of the structure tree.
			struct_parent = element.GetParentStructElement
			if struct_parent.IsValid
				# Print out the parent structural element's type, title, and object number.
				print " Type: " + struct_parent.GetType.to_s + ", MCID: " + element.GetStructMCID.to_s
				if struct_parent.HasTitle
					print ". Title: " + struct_parent.GetTitle
				end
				print ", Obj#: " + struct_parent.GetSDFObj.GetObjNum.to_s
			end
		end
		element = reader.Next
	end
end
		
# Recursively prints a structure element as an XML-like tag, with the page
# text of its marked-content kids placed between the opening and closing tag.
#
# element      - the structure element to print; an element whose IsValid is
#                false produces no output.
# mcid_doc_map - Hash keyed by page index; each value is a Hash keyed by MCID
#                whose value is the text to print for that MCID.
# indent       - indentation level for this element's tags; nested elements
#                are printed one level deeper.
def ProcessStructElement2(element, mcid_doc_map, indent)
	return unless element.IsValid

	# Opening tag, carrying the title as an attribute when there is one.
	PrintIndent(indent)
	print "<" + element.GetType
	print " title=\"" + element.GetTitle + "\"" if element.HasTitle
	print ">"

	element.GetNumKids.times do |idx|
		unless element.IsContentItem(idx)
			# the kid is another StructElement node.
			ProcessStructElement2(element.GetAsStructElem(idx), mcid_doc_map, indent + 1)
			next
		end

		item = element.GetAsContentItem(idx)
		next unless item.GetType == ContentItem::E_MCID

		# Look up the text collected for this item's page and MCID, if any.
		page_index = item.GetPage.GetIndex
		next unless mcid_doc_map.key?(page_index)

		page_texts = mcid_doc_map[page_index]
		mcid = item.GetMCID
		print page_texts[mcid] if page_texts.key?(mcid)
	end

	# Closing tag.
	PrintIndent(indent)
	print "</" + element.GetType + ">"
end

	# Driver script: runs the three logical-structure samples on tagged.pdf,
	# then saves the document and shuts the library down.
	PDFNet.Initialize(PDFTronLicense.Key)

	# Relative path to the folder containing the test files.
	input_path = "../../TestFiles/"
	output_path = "../../TestFiles/Output/"
	separator = "____________________________________________________________"

	# Extract logical structure from a PDF document
	doc = PDFDoc.new(input_path + "tagged.pdf")
	doc.InitSecurityHandler

	puts separator
	puts "Sample 1 - Traverse logical structure tree..."

	struct_tree = doc.GetStructTree
	if struct_tree.IsValid
		puts "Document has a StructTree root."

		# Recursively get structure info for all child elements.
		struct_tree.GetNumKids.times do |kid_index|
			ProcessStructElement(struct_tree.GetKid(kid_index), 0)
		end
	else
		puts "This document does not contain any logical structure."
	end

	puts "\nDone 1."

	puts separator
	puts "Sample 2 - Get parent logical structure elements from"
	puts "layout elements."

	# Read every page and report each element's parent structure element.
	page_reader = ElementReader.new
	page_itr = doc.GetPageIterator
	while page_itr.HasNext
		page_reader.Begin(page_itr.Current)
		ProcessElements(page_reader)
		page_reader.End
		page_itr.Next
	end

	puts "\nDone 2."

	puts separator
	puts "Sample 3 - 'XML style' extraction of PDF logical structure and page content."

	# A map which maps page numbers(as Integers)
	# to page Maps(which map from struct mcid(as Integers) to
	# text Strings)
	mcid_doc_map = {}
	page_reader = ElementReader.new
	page_itr = doc.GetPageIterator
	while page_itr.HasNext
		page_reader.Begin(page_itr.Current)
		mcid_doc_map[page_itr.Current.GetIndex] = ProcessElements2(page_reader)
		page_reader.End
		page_itr.Next
	end

	# Walk the structure tree again, this time printing tags with page text.
	struct_tree = doc.GetStructTree
	if struct_tree.IsValid
		struct_tree.GetNumKids.times do |kid_index|
			ProcessStructElement2(struct_tree.GetKid(kid_index), mcid_doc_map, 0)
		end
	end
	puts "\nDone 3."
	doc.Save((output_path + "LogicalStructure.pdf"), SDFDoc::E_linearized)
	doc.Close
	PDFNet.Terminate