Source code for madam.pdf

"""
PDF processor using pypdf (metadata) and pdf2image (rasterization).

The optional ``pdf`` dependency group must be installed::

    uv sync --extra pdf
"""

from __future__ import annotations

import io
from collections.abc import Mapping
from typing import IO, Any

from madam.core import Asset, OperatorError, Processor, operator

# Output MIME types supported by ``PDFProcessor.rasterize``, mapped to the
# format name that is passed to the PIL image's ``save(format=...)`` call.
_MIME_TYPE_TO_PIL_FORMAT: dict[str, str] = {'image/jpeg': 'JPEG', 'image/png': 'PNG'}



[docs]
class PDFProcessor(Processor):
    """
    Represents a processor that handles *Portable Document Format* (PDF) files.

    Reading requires `pypdf <https://pypdf.readthedocs.io/>`_.  Rasterization
    additionally requires `pdf2image <https://github.com/Belval/pdf2image>`_
    and a system-wide installation of *poppler*.

    Both third-party packages are imported lazily inside the methods that use
    them, so the class itself can be imported without the ``pdf`` extra.
    """

    def __init__(self, config: Mapping[str, Any] | None = None) -> None:
        """
        Initializes a new ``PDFProcessor``.

        :param config: Mapping with settings; passed unchanged to the base
            class initializer.
        """
        super().__init__(config)

    def can_read(self, file: IO) -> bool:
        """
        Returns whether the file starts with the PDF signature ``%PDF``.

        Only the first four bytes are inspected.  The stream is rewound to
        offset 0 afterwards, regardless of the outcome.

        :param file: Readable and seekable binary file-like object
        :type file: IO
        :return: ``True`` if the data begins with ``b'%PDF'``, otherwise ``False``
        :rtype: bool
        """
        header = file.read(4)
        file.seek(0)
        return header == b'%PDF'

    def read(self, file: IO) -> Asset:
        """
        Reads a PDF file and returns an :class:`~madam.core.Asset`.

        The returned asset carries a ``page_count`` metadata attribute with the
        number of pages in the document.

        :param file: Readable binary file-like object containing PDF data
        :type file: IO
        :return: Asset with ``mime_type='application/pdf'`` and ``page_count``
        :rtype: Asset
        :raises OperatorError: if the ``pypdf`` package is not installed
        """
        try:
            import pypdf
        except ImportError as e:
            raise OperatorError('pypdf is required for reading PDFs; install the pdf extra') from e

        # The raw bytes are kept so that the very same data can be parsed for
        # the page count and then stored as the essence of the asset.
        pdf_bytes = file.read()
        reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
        page_count = len(reader.pages)
        return Asset._from_bytes(pdf_bytes, mime_type='application/pdf', page_count=page_count)

    @operator
    def rasterize(self, asset: Asset, page: int = 0, dpi: int = 72, mime_type: str = 'image/jpeg') -> Asset:
        """
        Converts one page of a PDF asset into a raster image.

        Requires the ``pdf2image`` package and a system *poppler* installation.

        :param asset: PDF asset to rasterize
        :type asset: Asset
        :param page: Zero-based page index
        :type page: int
        :param dpi: Output resolution in dots per inch; must be positive
        :type dpi: int
        :param mime_type: MIME type of the output image (``'image/jpeg'`` or ``'image/png'``)
        :type mime_type: str
        :return: Raster image asset with ``width`` and ``height`` in pixels
        :rtype: Asset
        :raises OperatorError: if ``pdf2image`` is not installed, *page* is out
            of range, *dpi* is not positive, *mime_type* is unsupported, or
            rasterization fails
        """
        try:
            import pdf2image
        except ImportError as e:
            raise OperatorError('pdf2image is required for rasterization; install the pdf extra') from e

        page_count = asset.page_count
        if page < 0 or page >= page_count:
            raise OperatorError(f'Page index {page} is out of range for a PDF with {page_count} pages')

        pil_format = _MIME_TYPE_TO_PIL_FORMAT.get(mime_type)
        if pil_format is None:
            raise OperatorError(f'Unsupported MIME type for rasterization: {mime_type!r}')

        # Reject nonsensical resolutions up front instead of handing them to
        # the external renderer.
        if dpi <= 0:
            raise OperatorError(f'Resolution must be a positive number of dots per inch, got {dpi!r}')

        # pdf2image uses 1-based page numbers
        try:
            images = pdf2image.convert_from_bytes(
                asset.essence.read(),
                dpi=dpi,
                first_page=page + 1,
                last_page=page + 1,
            )
        except Exception as e:
            # Deliberately broad: this is the boundary to an external tool.
            # Every failure (missing poppler, damaged PDF, ...) is translated
            # into the documented OperatorError with the cause chained.
            raise OperatorError(f'Rasterization of page {page} failed: {e}') from e
        if not images:
            raise OperatorError(f'Rasterization produced no output for page {page}')

        pil_image = images[0]
        width, height = pil_image.size

        buf = io.BytesIO()
        try:
            pil_image.save(buf, format=pil_format)
        except (OSError, ValueError) as e:
            raise OperatorError(f'Could not encode page {page} as {mime_type!r}: {e}') from e
        # Rewind so that consumers of the new asset read from the start.
        buf.seek(0)

        return Asset(essence=buf, mime_type=mime_type, width=width, height=height)