Convert to PDF/UA - Python Sample Code

Sample code for using the Apryse SDK to programmatically convert generic PDF documents into ISO-compliant, VeraPDF-valid PDF/UA files. Supports PDF/UA-1. Sample code is provided in Python, C++, C#, Java, Node.js (JavaScript), PHP, Ruby and VB.

Learn more about our Server SDK and PDF/UA Library.

#---------------------------------------------------------------------------------------
# Copyright (c) 2001-2024 by Apryse Software Inc. All Rights Reserved.
# Consult LICENSE.txt regarding license information.
#---------------------------------------------------------------------------------------

# The SDK library folder is registered as a site directory before PDFNetPython
# is imported, so the statement order below is intentional.
import site
site.addsitedir("../../../PDFNetC/Lib")
import sys
from PDFNetPython import *

# The LicenseKey module lives outside the sample folder; its directory is
# appended to the module search path before the import.
sys.path.append("../../LicenseKey/PYTHON")
from LicenseKey import *
#---------------------------------------------------------------------------------------
# The following sample illustrates how to make sure a file meets the PDF/UA standard, using the PDFUAConformance class object.
# Note: this feature is currently experimental and subject to change
#
# DataExtractionModule is required (Mac users can use StructuredOutputModule instead)
# https://docs.apryse.com/documentation/core/info/modules/#data-extraction-module
# https://docs.apryse.com/documentation/core/info/modules/#structured-output-module (Mac)
#---------------------------------------------------------------------------------------

# Relative path to the folder containing the test files.
input_path = "../../TestFiles/"
# Relative path to the folder that receives the converted files.
output_path = "../../TestFiles/Output/"

# DataExtraction library location, replace if desired, should point to a folder that includes the contents of <DataExtractionModuleRoot>/Lib.
# If using default, unzip the DataExtraction zip to the parent folder of Samples, and merge with existing "Lib" folder.
extraction_module_path = "../../../PDFNetC/Lib/"
def main():
    """Convert two sample PDFs to PDF/UA with PDFUAConformance.AutoConvert.

    The first input is converted with default options. The second is
    converted with a PDFUAOptions object that asks for a linearized save.
    If the Data Extraction module is not available, an explanatory message
    is printed and no conversion is attempted.

    Exceptions raised during conversion are printed rather than propagated.
    PDFNet.Terminate() runs in a ``finally`` clause so the SDK is shut down
    on every exit path after PDFNet.Initialize(), including errors raised
    outside the conversion step.
    """
    input_file1 = input_path + "autotag_input.pdf"
    input_file2 = input_path + "table.pdf"
    output_file1 = output_path + "autotag_pdfua.pdf"
    output_file2 = output_path + "table_pdfua_linearized.pdf"

    PDFNet.Initialize(LicenseKey)
    try:
        print("AutoConverting...")

        # Tell the SDK where to look for the optional module files.
        PDFNet.AddResourceSearchPath(extraction_module_path)

        if not DataExtractionModule.IsModuleAvailable(DataExtractionModule.e_DocStructure):
            print("")
            print("Unable to run Data Extraction: PDFTron SDK Structured Output module not available.")
            print("-----------------------------------------------------------------------------")
            print("The Data Extraction suite is an optional add-on, available for download")
            print("at https://docs.apryse.com/documentation/core/info/modules/. If you have already")
            print("downloaded this module, ensure that the SDK is able to find the required files")
            print("using the PDFNet.AddResourceSearchPath() function.")
            print("")
            # The finally clause below still calls PDFNet.Terminate().
            return

        try:
            pdf_ua = PDFUAConformance()

            print("Simple Conversion...")
            # Perform conversion using default options
            pdf_ua.AutoConvert(input_file1, output_file1)

            print("Converting With Options...")
            pdf_ua_opts = PDFUAOptions()
            pdf_ua_opts.SetSaveLinearized(True)  # Linearize when saving output
            # Note: if file is password protected, you can use pdf_ua_opts.SetPassword()

            # Perform conversion using the options we specify
            pdf_ua.AutoConvert(input_file2, output_file2, pdf_ua_opts)
        except Exception as e:
            # Sample-level boundary: report the failure and fall through to
            # the normal shutdown path.
            print(str(e))
    finally:
        PDFNet.Terminate()

    print("PDFUAConformance test completed.")
# Run the sample only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Did you find this helpful?

Trial setup questions?

Ask experts on Discord

Need other help?

Contact Support

Pricing or product questions?

Contact Sales