More languages
Some test text!
More languages
Sample Obj-C code that uses the PDFTron SDK to explore the logical structure and content of a tagged PDF file and dump the information to the console window. In tagged PDF files, StructTree acts as a central repository for information related to a PDF document's logical structure. The tree consists of StructElement-s and ContentItem-s; the ContentItem-s are the leaf nodes of the structure tree. Learn more about our Obj-C PDF Library and PDF Parsing & Content Extraction Library.
Get Started | Samples | Download. To run this sample, get started with a free trial of Apryse SDK.
//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
#import <OBJC/PDFNetOBJC.h>
#import <Foundation/Foundation.h>
//---------------------------------------------------------------------------------------
// This sample explores the structure and content of a tagged PDF document and dumps
// the structure information to the console window.
//
// In tagged PDF documents StructTree acts as a central repository for information
// related to a PDF document's logical structure. The tree consists of StructElement-s
// and ContentItem-s which are leaf nodes of the structure tree.
//
// The sample can be extended to access and extract the marked-content elements such
// as text and images.
//---------------------------------------------------------------------------------------
// Builds the line prefix used when dumping the structure tree: a newline
// followed by `ident` space characters. A zero or negative `ident` yields
// just the newline.
NSString *PrintIdent(int ident) {
    NSMutableString *prefix = [NSMutableString stringWithString: @"\n"];
    int remaining = ident;
    while (remaining > 0) {
        [prefix appendString: @" "];
        --remaining;
    }
    // Return an immutable string, matching what callers received before.
    return [NSString stringWithString: prefix];
}
// Used in code snippet 1.
// Recursively renders a structure element and everything below it as text.
//
// element - the structure element to describe; an invalid element yields @"".
// ident   - indentation depth of this element; its kids are rendered one
//           level deeper.
//
// Returns the accumulated description. Every entry begins with the prefix
// produced by PrintIdent (a newline followed by the indentation).
NSString* ProcessStructElement(PTSElement *element, int ident)
{
    if (![element IsValid]) {
        return @"";
    }

    NSMutableString *out = [NSMutableString string];

    // Header: the element's type, followed by its title when it has one.
    [out appendFormat: @"%@Type: %@", PrintIdent(ident), [element GetType]];
    if ([element HasTitle]) {
        [out appendFormat: @". Title: %@", [element GetTitle]];
    }

    // Kids (content items and nested elements) sit one level below the header.
    const int kidIdent = ident + 1;
    const int kidCount = [element GetNumKids];

    for (int k = 0; k < kidCount; ++k) {
        if (![element IsContentItem: k]) {
            // The kid is another StructElement node: descend into it.
            [out appendString: ProcessStructElement([element GetAsStructElem: k], kidIdent)];
            continue;
        }

        // The kid is a leaf node (i.e. a ContentItem).
        PTContentItem *item = [element GetAsContentItem: k];
        PTContentItemType itemType = [item GetType];
        PTPage *owningPage = [item GetPage];
        [out appendFormat: @"%@Content Item. Part of page #%d%@",
                           PrintIdent(kidIdent), [owningPage GetIndex], PrintIdent(kidIdent)];

        if (itemType == e_ptMCID || itemType == e_ptMCR) {
            [out appendFormat: @"MCID: %d", [item GetMCID]];
        }
        else if (itemType == e_ptOBJR) {
            [out appendString: @"OBJR "];
            PTObj *referenced = [item GetRefObj];
            if (referenced != NULL) {
                [out appendFormat: @"- Referenced Object#: %u", [referenced GetObjNum]];
            }
        }
        // Any other content item type contributes nothing beyond the page line.
    }

    return [NSString stringWithString: out];
}
// Used in code snippet 2.
// Walks the elements supplied by `reader` and describes every path, text, and
// form XObject element, together with the structure element (if any) that the
// element belongs to.
//
// reader - an element reader positioned on the content to scan; this function
//          only calls Next on it (the caller in this file brackets the call
//          with Begin:/End).
//
// Returns the accumulated description, or @"" when no handled element was read.
NSString* ProcessLogicalStructureTestElements(PTElementReader *reader)
{
    PTElement *element;
    NSString *result = @"";
    while ((element = [reader Next]) != NULL) // Read page contents
    {
        // In this sample we process only paths, text, and form XObjects, but
        // the code can be extended to handle any element type. The switch is
        // the single place that decides which types are handled; previously a
        // separate pre-filter tested e_ptpath twice and never e_ptform, which
        // made the form XObject case unreachable.
        switch ([element GetType]) {
            case e_ptpath: // Process path ...
                result = [result stringByAppendingString: @"\nPATH: "];
                break;
            case e_pttext_obj: // Process text ...
                result = [result stringByAppendingFormat: @"\nTEXT: %@\n", [element GetTextString]];
                break;
            case e_ptform: // Process form XObjects
                result = [result stringByAppendingString: @"\nFORM XObject: "];
                // The form's own content is not descended into here:
                //reader.FormBegin();
                //ProcessLogicalStructureTestElements(reader);
                //reader.End();
                break;
            default:
                // Not a handled element type: move on to the next element.
                continue;
        }

        // Check if the element is associated with any structural element.
        // Content items are leaf nodes of the structure tree.
        PTSElement *struct_parent = [element GetParentStructElement];
        if ([struct_parent IsValid]) {
            // Print out the parent structural element's type, the element's
            // MCID, the parent's title (if any), and the parent's object number.
            result = [result stringByAppendingFormat: @" Type: %@, MCID: %d", [struct_parent GetType], [element GetStructMCID]];
            if ([struct_parent HasTitle]) {
                result = [result stringByAppendingFormat: @". Title: %@", [struct_parent GetTitle]];
            }
            result = [result stringByAppendingFormat: @", Obj#: %u", [[struct_parent GetSDFObj] GetObjNum]];
        }
    }
    return result;
}
// Used in code snippet 3.
// The two globals below carry the names of these C++-style typedefs:
//   typedef map<int, string> MCIDPageMap;
//   typedef map<int, MCIDPageMap> MCIDDocMap;
// NOTE(review): no function visible in this file reads or writes these
// globals; the snippet 3 helpers receive their dictionaries as parameters.
// Confirm nothing outside this file references them before removing.
NSMutableDictionary *MCIDPageMap;
NSMutableDictionary *MCIDDocMap;
// Used in code snippet 3.
// Collects the text of every marked-content text element supplied by `reader`,
// grouped by MCID.
//
// reader        - an element reader positioned on the content to scan; this
//                 function only calls Next on it.
// mcid_page_map - updated in place. Keys are boxed MCIDs (@(mcid)); each value
//                 is the text of all text elements read so far with that MCID,
//                 concatenated in reading order.
void ProcessLogicalStructureTestElements2(PTElementReader *reader, NSMutableDictionary *mcid_page_map)
{
    PTElement *element;
    while ((element = [reader Next]) != NULL) // Read page contents
    {
        // In this sample we process only text, but the code can be extended
        // to handle paths, images, or any other Element type.
        int mcid = [element GetStructMCID];
        if (mcid >= 0 && [element GetType] == e_pttext_obj) {
            NSString *val = [element GetTextString];
            id key = @(mcid);
            // A keyed lookup returns nil when the key is absent, so a single
            // lookup both tests for and fetches the existing text. This avoids
            // building and linearly scanning allKeys for every element.
            NSString *existing = mcid_page_map[key];
            if (existing) {
                mcid_page_map[key] = [existing stringByAppendingString: val];
            }
            else {
                mcid_page_map[key] = val;
            }
        }
    }
}
// Used in code snippet 3.
// Recursively renders a structure element as an "XML style" fragment: an
// opening tag named after the element's type (with a title attribute when the
// element has a title), the text of its MCID content items, its nested
// structure elements, and a matching closing tag.
//
// element      - the structure element to render; an invalid element yields @"".
// mcid_doc_map - maps a boxed page index (@(page_num)) to a dictionary that
//                maps a boxed MCID to the text collected for it. Content items
//                without an entry contribute no text.
// ident        - indentation depth of this element's tags; nested elements
//                are rendered at ident+1.
//
// Returns the rendered fragment.
NSString* ProcessStructElement2(PTSElement *element, NSMutableDictionary *mcid_doc_map, int ident)
{
    if (![element IsValid]) {
        return @"";
    }

    // The type names both the opening and the closing tag.
    NSString *type_name = [element GetType];

    NSString *result = @"";
    // Print out the type and title info, if any.
    result = [result stringByAppendingString: PrintIdent(ident)];
    result = [result stringByAppendingFormat: @"<%@", type_name];
    if ([element HasTitle]) {
        result = [result stringByAppendingFormat: @" title=\"%@\"", [element GetTitle]];
    }
    result = [result stringByAppendingString: @">"];

    int num = [element GetNumKids];
    int i;
    for (i=0; i<num; ++i)
    {
        if ([element IsContentItem: i]) {
            PTContentItem *cont = [element GetAsContentItem: i];
            if ([cont GetType] == e_ptMCID) {
                int page_num = [[cont GetPage] GetIndex];
                // Keyed lookups return nil for a missing key, so each level
                // is tested and fetched with one lookup instead of scanning
                // allKeys.
                NSMutableDictionary *mcid_page_map = mcid_doc_map[@(page_num)];
                if (mcid_page_map) {
                    NSString *str = mcid_page_map[@([cont GetMCID])];
                    if (str) {
                        result = [result stringByAppendingString: str];
                    }
                }
            }
        }
        else { // the kid is another StructElement node.
            result = [result stringByAppendingString: ProcessStructElement2([element GetAsStructElem :i], mcid_doc_map, ident+1)];
        }
    }

    result = [result stringByAppendingString: PrintIdent(ident)];
    result = [result stringByAppendingFormat: @"</%@>", type_name];
    return result;
}
// Entry point. Opens ../../TestFiles/tagged.pdf and runs three samples on it:
//   1. a recursive dump of the logical structure tree,
//   2. a per-page listing of layout elements with their parent structure
//      elements,
//   3. an "XML style" rendering of the structure tree combined with page text.
// The document is then saved to ../../TestFiles/Output/LogicalStructure.pdf.
//
// Returns 0 on success, or 1 when an NSException was caught.
int main(int argc, char *argv[])
{
    @autoreleasepool {
        int exitCode = 0;
        [PTPDFNet Initialize: 0];

        @try // Extract logical structure from a PDF document
        {
            PTPDFDoc *pdf = [[PTPDFDoc alloc] initWithFilepath: @"../../TestFiles/tagged.pdf"];
            [pdf InitSecurityHandler];

            NSLog(@"____________________________________________________________");
            NSLog(@"Sample 1 - Traverse logical structure tree...");
            {
                PTSTree *structTree = [pdf GetStructTree];
                if (![structTree IsValid]) {
                    NSLog(@"This document does not contain any logical structure.");
                }
                else {
                    NSLog(@"Document has a StructTree root.");
                    int kid = 0;
                    while (kid < [structTree GetNumKids]) {
                        // Recursively get structure info for all child elements.
                        NSLog(@"%@", ProcessStructElement([structTree GetKid: kid], 0));
                        ++kid;
                    }
                }
            }
            NSLog(@"Done 1.");

            NSLog(@"____________________________________________________________");
            NSLog(@"Sample 2 - Get parent logical structure elements from");
            NSLog(@"layout elements.");
            {
                PTElementReader *pageReader = [[PTElementReader alloc] init];
                PTPageIterator *pages = [pdf GetPageIterator: 1];
                while ([pages HasNext]) {
                    [pageReader Begin: [pages Current]];
                    NSLog(@"%@", ProcessLogicalStructureTestElements(pageReader));
                    [pageReader End];
                    [pages Next];
                }
            }
            NSLog(@"Done 2.");

            NSLog(@"____________________________________________________________");
            NSLog(@"Sample 3 - 'XML style' extraction of PDF logical structure and page content.");
            {
                // First pass: collect the text of every page, keyed by page
                // index and then by MCID.
                NSMutableDictionary *textByPage = [[NSMutableDictionary alloc] init];
                PTElementReader *pageReader = [[PTElementReader alloc] init];
                PTPageIterator *pages = [pdf GetPageIterator: 1];
                while ([pages HasNext]) {
                    [pageReader Begin: [pages Current]];
                    NSMutableDictionary *textByMCID = [[NSMutableDictionary alloc] init];
                    id pageKey = @([[pages Current] GetIndex]);
                    textByPage[pageKey] = textByMCID;
                    ProcessLogicalStructureTestElements2(pageReader, textByMCID);
                    [pageReader End];
                    [pages Next];
                }

                // Second pass: render the structure tree using the collected text.
                PTSTree *structTree = [pdf GetStructTree];
                if ([structTree IsValid]) {
                    int kid = 0;
                    while (kid < [structTree GetNumKids]) {
                        NSLog(@"%@", ProcessStructElement2([structTree GetKid: kid], textByPage, 0));
                        ++kid;
                    }
                }
            }
            NSLog(@"Done 3.");

            [pdf SaveToFile: @"../../TestFiles/Output/LogicalStructure.pdf" flags: e_ptlinearized];
        }
        @catch(NSException *e)
        {
            // Log the failure and report it through the exit code.
            NSLog(@"%@", e.reason);
            exitCode = 1;
        }

        [PTPDFNet Terminate: 0];
        return exitCode;
    }
}