Python API

PDF

class RPA.PDF.PDF(outdir: str = '.')

Bases: fpdf.fpdf.FPDF, fpdf.html.HTMLMixin

PDF is a library for managing PDF documents.

It provides an easy method of generating a PDF document from an HTML formatted template file.

Examples

Robot Framework

*** Settings ***
Library    RPA.PDF

*** Variables ***
${TEMPLATE}    order.template
${PDF}         result.pdf
&{VARS}        name=Robot Generated
...            email=robot@domain.com
...            zip=00100
...            items=Item 1, Item 2

*** Tasks ***
Create PDF from HTML template
    Template HTML to PDF   ${TEMPLATE}  ${PDF}  ${VARS}

Python

from RPA.PDF import PDF

p = PDF()
orders = ["item 1", "item 2", "item 3"]
vars = {
    "name": "Robot Process",
    "email": "robot@domain.com",
    "zip": "00100",
    "items": "<br/>".join(orders),
}
p.template_html_to_pdf("order.template", "order.pdf", vars)
ROBOT_LIBRARY_DOC_FORMAT = 'REST'
ROBOT_LIBRARY_SCOPE = 'GLOBAL'
accept_page_break()

Accept automatic page break or not

active_fileobject: object = None
add_font(family, style='', fname='', uni=False)

Add a TrueType or Type1 font

add_image_to_pdf(imagefile, source=None, target=None, coverage=0.2)

Add image to PDF which can be new or existing PDF.

Parameters
  • imagefile – filepath to image file to add into PDF

  • source – filepath to source, if not given add image to currently active PDF

  • target – filepath of target PDF

  • coverage – [description], defaults to 0.2

Raises

ValueError – [description]

The result is always written to the target PDF, so target must be given for the keyword.

add_link()

Create a new internal link

add_page(orientation='')

Start a new page

add_pages(pages: int = 1) → None

Adds pages into PDF documents.

Parameters

pages – number of pages to add, defaults to 1

add_pages_to_document(pages: int = 1, source_pdf: str = None, target_pdf: str = None) → None

Add empty pages into current source document

Parameters
  • pages – number of pages to add, defaults to 1

  • source_pdf – filepath to the source pdf

  • target_pdf – filename to the target pdf, stored by default to output_directory

alias_nb_pages(alias='{nb}')

Define an alias for total number of pages

anchor_element: dict = None
cell(w, h=0, txt='', border=0, ln=0, align='', fill=0, link='')

Output a cell

check_page()

Decorator to protect drawing methods

close()

Terminate document

close_all_pdf_documents() → None

Close all opened PDF file descriptors.

close_pdf_document(source_pdf: str = None)

Close PDF file descriptor for certain file.

Parameters

source_pdf – filepath

Raises

ValueError – if file descriptor for the file is not found

code39(txt, x, y, w=1.5, h=5.0)

Barcode 3of9

dashed_line(x1, y1, x2, y2, dash_length=1, space_length=1)

Draw a dashed line. Same interface as line(), with two additional parameters: dash_length (length of each dash) and space_length (length of the space between dashes).

dump_pdf_as_xml(source_pdf: str = None)

Get PDFMiner format XML dump of the PDF

Parameters

source_pdf – filepath

Returns

XML content

ellipse(x, y, w, h, style='')

Draw an ellipse

error(msg)

Fatal error

extract_pages_from_pdf(source_pdf: str = None, target_pdf: str = None, pages: Any = None) → None

Extract pages from source PDF and save to target PDF document.

Parameters
  • source_pdf – filepath to the source pdf

  • target_pdf – filename to the target pdf, stored by default to output_directory

  • pages – page numbers to extract from PDF (numbers start from 0); if None, all pages are extracted

Page numbers starting from 1.

fileobjects: dict = None
footer()

Footer to be implemented in your own inherited class

get_all_figures() → dict

Return all figures in the PDF document.

Returns

dictionary of figures divided into pages

PDF needs to be parsed before elements can be found.

get_info(source_pdf: str = None) → dict

Get information from PDF document.

Parameters

source_pdf – filepath to the source pdf

Returns

dictionary of PDF information

get_input_fields(source_pdf: str = None, replace_none_value: bool = False) → dict

Get input fields in the PDF.

Parameters
  • source_pdf – source filepath, defaults to None

  • replace_none_value – if value is None replace it with key name, defaults to False

Returns

dictionary of input key values or None

Stores input fields internally so that they can be used without parsing PDF again.

Parameter replace_none_value is a convenience for visualizing fields.

get_number_of_pages(source_pdf: str = None) → int

Get number of pages in the document.

Parameters

source_pdf – filepath to the source pdf

Raises

PdfReadError – if file is encrypted or other restrictions are in place

get_output_directory() → str

Get output directory where target files are saved to.

Returns

absolute filepath as string

get_string_width(s)

Get width of a string in the current font

get_text_from_pdf(source_pdf: str = None, pages: Any = None) → dict

Get text from set of pages in source PDF document.

Parameters
  • source_pdf – filepath to the source pdf

  • pages – page numbers to get text (numbers start from 0)

Returns

dictionary of pages and their texts

PDF needs to be parsed before text can be read.

get_value_from_anchor(locator: str, pagenum: int = 1, direction: str = 'right', strict: bool = False, regexp: str = None) → str

Get closest text (value) to anchor element.

PDF needs to be parsed before elements can be found.

Parameters
  • locator – element to set anchor to

  • pagenum – page number on which the search is performed, default 1 (first)

  • direction – in which direction to search for text, directions ‘top’, ‘bottom’, ‘left’ or ‘right’, defaults to ‘right’

  • strict – if element margins should be used for matching points, used when direction is ‘top’ or ‘bottom’, default False

  • regexp – expected format of value to match, defaults to None

Returns

closest matching text or None

get_x()

Get x position

get_y()

Get y position

header()

Header to be implemented in your own inherited class

html_to_pdf(content: str = None, filename: str = None, variables: dict = None, create_dirs: bool = True, exists_ok: bool = True) → None

Use HTML content to generate PDF file.

Parameters
  • content – HTML content

  • filename – filepath where to save PDF document

  • variables – dictionary of variables to fill into template, defaults to {}

  • create_dirs – directory structure is created if it is missing, default True

  • exists_ok – file is overwritten if it exists, default True

image(name, x=None, y=None, w=0, h=0, type='', link='')

Put an image on the page

interleaved2of5(txt, x, y, w=1.0, h=10.0)

Barcode I2of5 (numeric), adds a 0 if odd length

is_pdf_encrypted(source_pdf: str = None) → bool

Check if PDF is encrypted.

Returns True even if PDF was decrypted.

Parameters

source_pdf – filepath to the source pdf

Returns

True if file is encrypted

line(x1, y1, x2, y2)

Draw a line

link(x, y, w, h, link)

Put a link on the page

ln(h='')

Line Feed; default value is last cell height

modified_reader: PdfFileReader = None
multi_cell(w, h, txt='', border=0, align='J', fill=0, split_only=False)

Output text with automatic or explicit line breaks

normalize_text(txt)

Check that text input is in the correct format/encoding

open()

Begin document

open_pdf_document(source_pdf: str = None) → None

Open PDF document.

Parameters

source_pdf – filepath to the source pdf

Raises

ValueError – if PDF is already open

Also opens file for reading.

output(name='', dest='')

Output PDF to some destination

output_directory: Path = None
page_no()

Get current page number

page_rotate(pages: int, source_pdf: str = None, target_pdf: str = None, clockwise: bool = True, angle: int = 90) → None

Rotate pages in source PDF document and save to target PDF document.

Parameters
  • source_pdf – filepath to the source pdf

  • target_pdf – filename to the target pdf, stored by default to output_directory

  • pages – page numbers to rotate (numbers start from 0)

  • clockwise – direction in which the page will be rotated, default True

  • angle – number of degrees to rotate, default 90

parse_pdf(source_pdf: str = None) → None

Parse source PDF into entities which can be used for text searches for example.

Parameters

source_pdf – filepath to the source pdf

pdf_decrypt(source_pdf: str = None, target_pdf: str = None, password: str = None) → bool

Decrypt PDF with password.

Parameters
  • source_pdf – filepath to the source pdf

  • target_pdf – filepath to the decrypted pdf

  • password – password as a string

Returns

True if decryption was successful, otherwise False; an exception is raised on decryption errors

Raises

ValueError – on decryption errors

pdf_encrypt(source_pdf: str = None, target_pdf: str = None, user_pwd: str = '', owner_pwd: str = None, use_128bit: bool = True) → None

Encrypt PDF document.

Parameters
  • source_pdf – filepath to the source pdf

  • target_pdf – filename to the target pdf, stored by default to output_directory

  • user_pwd – allows opening and reading PDF with restrictions

  • owner_pwd – allows opening PDF without any restrictions, by default same as user_pwd

  • use_128bit – whether to use 128-bit encryption; when False, 40-bit encryption is used; default True

rect(x, y, w, h, style='')

Draw a rectangle

replace_text(text: str, replace: str)

Replace text content with something else in the PDF.

Parameters
  • text – this text will be replaced

  • replace – used to replace text

rotate(angle, x=None, y=None)
rpa_pdf_document: RpaPdfDocument = None
save_pdf(source: str = None, target: str = None, use_modified_reader: bool = False)

Save the current PDF over itself or to target

Parameters
  • source – filepath to source PDF

  • target – filepath to target PDF

  • use_modified_reader – needs to be set to True if using modified PDF reader

set_anchor_to_element(locator: str) → bool

Sets anchor point in the document for further searches.

PDF needs to be parsed before elements can be found.

Parameters

locator – element to search for

Returns

True if element was found

set_author(author)

Author of document

set_auto_page_break(auto, margin=0)

Set auto page break mode and triggering margin

set_compression(compress)

Set page compression

set_creator(creator)

Creator of document

set_display_mode(zoom, layout='continuous')

Set display mode in viewer

The “zoom” argument may be ‘fullpage’, ‘fullwidth’, ‘real’, ‘default’, or a number, interpreted as a percentage.

set_draw_color(r, g=-1, b=-1)

Set color for all stroking operations

set_field_value(field_name: str, value: Any, save: bool = False)

Set value for field with given name.

Parameters
  • field_name – field to update

  • value – new value for the field

Tries to match on field identifier and its label.

Exception is thrown if field can’t be found or more than 1 field matches the given field_name.

set_fill_color(r, g=-1, b=-1)

Set color for all filling operations

set_font(family, style='', size=0)

Select a font; size given in points

set_font_size(size)

Set font size in points

set_keywords(keywords)

Keywords of document

set_left_margin(margin)

Set left margin

set_line_width(width)

Set line width

set_link(link, y=0, page=-1)

Set destination of internal link

set_margins(left, top, right=-1)

Set left, top and right margins

set_output_directory(outdir: str = '.') → None

Set output directory where target files are saved to.

Parameters

outdir – output directory path, default to current directory

set_right_margin(margin)

Set right margin

set_subject(subject)

Subject of document

set_text_color(r, g=-1, b=-1)

Set color for text

set_title(title)

Title of document

set_top_margin(margin)

Set top margin

set_x(x)

Set x position

set_xy(x, y)

Set x and y positions

set_y(y)

Set y position and reset x

switch_to_pdf_document(source_pdf: str = None) → None

Switch library’s current fileobject to already open file or open file if not opened.

Parameters

source_pdf – filepath

Raises

ValueError – if PDF filepath is not given and there is no active file to activate

template_html_to_pdf(template: str = None, filename: str = None, variables: dict = None, create_dirs: bool = True, exists_ok: bool = True) → None

Use HTML template file to generate PDF file.

Parameters
  • template – filepath to HTML template

  • filename – filepath where to save PDF document

  • variables – dictionary of variables to fill into template, defaults to {}

  • create_dirs – directory structure is created if it is missing, default True

  • exists_ok – file is overwritten if it exists, default True

text(x, y, txt='')

Output a string

update_field_values(source_pdf: str = None, target_pdf: str = None, newvals: dict = None) → None

Update field values in PDF if it has fields.

Parameters
  • source_pdf – source PDF with fields to update

  • target_pdf – updated target PDF

  • newvals – dictionary with key values to update

write(h, txt='', link='')

Output text in flowing mode

write_html(text, image_map=None)

Parse HTML and convert it to PDF

class RPA.PDF.PageGenerator(gen)

Bases: object

Supporting generator class for Pages

class RPA.PDF.RPAConverter(rsrcmgr, codec='utf-8', pageno=1, laparams=None, imagewriter=None, stripcontrol=False)

Bases: pdfminer.converter.PDFConverter

Class for converting PDF into RPA classes

CONTROL = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')
begin_figure(name, bbox, matrix)
begin_page(page, ctm)
begin_tag(tag, props=None)
close()
do_tag(tag, props=None)
end_figure(_)
end_page(page)
end_tag()
handle_undefined_char(font, cid)
paint_path(gstate, stroke, fill, evenodd, path)

Paint paths described in section 4.4 of the PDF reference manual

receive_layout(ltpage)
render_char(matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate)
render_image(name, stream)
render_string(textstate, seq, ncs, graphicstate)
render_string_horizontal(seq, matrix, pos, font, fontsize, scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
render_string_vertical(seq, matrix, pos, font, fontsize, scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
set_ctm(ctm)
write(text)
write_header()
write_text(text)
class RPA.PDF.RpaFigure(name: str, bbox: Iterable)

Bases: object

Class for each LTFigure element in the PDF

details()
figure_bbox: list = None
figure_name: str = None
image_name: str = None
item: dict = None
set_item(item: Any)
class RPA.PDF.RpaPdfDocument

Bases: object

Class for parsed PDF document

add_page(page: RPA.PDF.RpaPdfPage) → None
append_xml(xml: bytes) → None
dump_xml() → str
encoding: str = 'utf-8'
get_page(pagenum: int) → RPA.PDF.RpaPdfPage
get_pages() → collections.OrderedDict
pages: OrderedDict = None
xml_content: bytearray = bytearray(b'')
class RPA.PDF.RpaPdfPage(pageid: int, bbox: Iterable, rotate: int)

Bases: object

Class for each PDF page

add_content(content: Any) → None
bbox: list = None
content: OrderedDict = None
content_id: int = None
get_content() → collections.OrderedDict
get_figures() → collections.OrderedDict
get_textboxes() → collections.OrderedDict
pageid: str = None
rotate: int = None
class RPA.PDF.RpaTextBox(boxid: int, bbox: Iterable, wmode: str)

Bases: object

Class for each LTTextBox element in the PDF

property bbox
property bottom
property boxid
item: dict = None
property left
property right
set_item(item: Any)
property text
textbox_bbox: list = None
textbox_id: int = None
textbox_wmode: str = None
property top
RPA.PDF.iterable_items_to_int(bbox)