Source code for madam.iptc

"""
IPTC/IIM metadata processor for JPEG images.

Reads and writes IPTC Application Record (record 2) fields embedded in
the JPEG APP13/Photoshop 3.0 block.  No additional dependencies beyond
Pillow are required.
"""

from __future__ import annotations

import io
import struct
from collections.abc import Iterable, Mapping
from typing import IO, Any

import PIL.Image
import PIL.IptcImagePlugin

from madam.core import MetadataProcessor, UnsupportedFormatError

# Mapping from IPTC Application Record dataset numbers to friendly names.
# Only dataset numbers that map to a single value are listed here; the
# special case of keywords (repeatable) is handled separately.
_DATASET_TO_KEY: dict[int, str] = {
    5: 'object_name',
    15: 'category',
    40: 'instructions',
    80: 'author',
    85: 'author_title',
    90: 'city',
    92: 'sublocation',
    95: 'state',
    100: 'country_code',
    101: 'country',
    105: 'headline',
    110: 'credit',
    115: 'source',
    116: 'copyright',
    120: 'caption',
}

# Repeatable fields: stored as lists of strings.
_REPEATABLE_DATASETS: frozenset[int] = frozenset({25})  # keywords

_KEY_TO_DATASET: dict[str, int] = {v: k for k, v in _DATASET_TO_KEY.items()}
_KEY_TO_DATASET['keywords'] = 25


def _make_iptc_record(dataset: int, data: str | bytes) -> bytes:
    """Return a single IPTC IIM record for Application Record 2."""
    if isinstance(data, str):
        data = data.encode('latin-1', errors='replace')
    return bytes([0x1C, 2, dataset]) + struct.pack('>H', len(data)) + data


def _make_8bim(resource_type: int, data: bytes) -> bytes:
    """Wrap *data* in a Photoshop 8BIM resource block."""
    name = b'\x00\x00'  # empty Pascal string with even-alignment padding
    block = b'8BIM' + struct.pack('>H', resource_type) + name + struct.pack('>I', len(data)) + data
    if len(data) % 2 == 1:
        block += b'\x00'  # pad to even boundary
    return block


def _build_app13(iptc_records: bytes) -> bytes:
    """
    Wrap IPTC IIM records in a JPEG APP13 marker segment.

    The records are stored as Photoshop resource ``0x0404`` behind the
    ``Photoshop 3.0`` identifier.
    """
    payload = b''.join((b'Photoshop 3.0\x00', _make_8bim(0x0404, iptc_records)))
    # The segment length counts its own two bytes but not the marker.
    segment_length = struct.pack('>H', len(payload) + 2)
    return b''.join((b'\xff\xed', segment_length, payload))



class IPTCMetadataProcessor(MetadataProcessor):
    """
    Reads and writes IPTC/IIM metadata embedded in JPEG files.

    IPTC data is stored in the JPEG APP13 (Photoshop 3.0) block.  Only JPEG
    is supported; attempts to read non-JPEG data raise
    :class:`~madam.core.UnsupportedFormatError`.

    Supported metadata keys under the ``'iptc'`` namespace:

    * ``object_name`` — Object Name (dataset 5)
    * ``category`` — Category (dataset 15)
    * ``keywords`` — Keywords, list of strings (dataset 25, repeatable)
    * ``instructions`` — Special Instructions (dataset 40)
    * ``author`` — By-line / Author (dataset 80)
    * ``author_title`` — By-line Title (dataset 85)
    * ``city`` — City (dataset 90)
    * ``sublocation`` — Sublocation (dataset 92)
    * ``state`` — Province/State (dataset 95)
    * ``country_code`` — Country Code (dataset 100)
    * ``country`` — Country (dataset 101)
    * ``headline`` — Headline (dataset 105)
    * ``credit`` — Credit (dataset 110)
    * ``source`` — Source (dataset 115)
    * ``copyright`` — Copyright Notice (dataset 116)
    * ``caption`` — Caption/Abstract (dataset 120)

    .. versionadded:: 0.24
    """

    def __init__(self, config: Mapping[str, Any] | None = None) -> None:
        """
        Initializes a new ``IPTCMetadataProcessor``.

        :param config: Mapping with settings.
        """
        super().__init__(config)

    @property
    def formats(self) -> Iterable[str]:
        """
        Metadata formats handled by this processor (only ``'iptc'``).
        """
        return {'iptc'}

    def read(self, file: IO) -> Mapping[str, Mapping]:
        """
        Reads IPTC metadata from a JPEG file.

        :param file: Readable binary file-like object containing JPEG data.
        :type file: IO
        :return: Mapping with a single ``'iptc'`` key whose value is a dict of
            decoded metadata fields. Returns an empty mapping when the file
            contains no IPTC data.
        :rtype: Mapping[str, Mapping]
        :raises UnsupportedFormatError: if the data is not a JPEG file or the
            IPTC data cannot be parsed.
        """
        data = file.read()
        if not data.startswith(b'\xff\xd8'):
            raise UnsupportedFormatError('IPTC metadata is only supported for JPEG files.')

        try:
            with PIL.Image.open(io.BytesIO(data)) as image:
                raw = PIL.IptcImagePlugin.getiptcinfo(image)
        except Exception as exc:
            # Deliberately broad: any failure while parsing is reported to
            # the caller as an unsupported file.
            raise UnsupportedFormatError(f'Could not read IPTC data: {exc}') from exc

        if not raw:
            return {}

        iptc: dict[str, Any] = {}
        for (record, dataset), value in raw.items():
            if record != 2:
                # Only the Application Record is exposed.
                continue
            if dataset in _REPEATABLE_DATASETS:
                # A single occurrence arrives as bytes, several as a list.
                values = value if isinstance(value, list) else [value]
                iptc['keywords'] = [item.decode('latin-1') for item in values]
            elif dataset in _DATASET_TO_KEY:
                # A single-valued dataset that occurs repeatedly arrives as
                # a list; keep its first occurrence.
                raw_bytes = value if isinstance(value, bytes) else value[0]
                iptc[_DATASET_TO_KEY[dataset]] = raw_bytes.decode('latin-1')

        if not iptc:
            return {}
        return {'iptc': iptc}

    def strip(self, file: IO) -> IO:
        """
        Returns a copy of the JPEG file with all IPTC (APP13) data removed.

        Every APP13 segment located before the start of the image scan is
        dropped as a whole, including any other Photoshop resources stored
        in it.  All other bytes are copied unchanged.

        :param file: Readable binary file-like object containing JPEG data.
        :type file: IO
        :return: File-like object with IPTC data removed.
        :rtype: IO
        :raises UnsupportedFormatError: if the data is not a JPEG file.
        """
        data = file.read()
        if not data.startswith(b'\xff\xd8'):
            raise UnsupportedFormatError('IPTC strip is only supported for JPEG files.')

        result = io.BytesIO()
        pos = 0
        length = len(data)
        while pos < length:
            if data[pos] != 0xFF or pos + 1 >= length:
                # Not a marker, or a marker cut off by the end of the data:
                # copy the remainder verbatim.
                result.write(data[pos:])
                break

            marker_byte = data[pos + 1]
            if marker_byte == 0xFF:
                # Fill byte preceding a marker; keep it and look again.
                result.write(b'\xff')
                pos += 1
                continue
            if marker_byte in (0xD8, 0xD9, 0x01) or 0xD0 <= marker_byte <= 0xD7:
                # Stand-alone marker without a length field.
                result.write(data[pos : pos + 2])
                pos += 2
                continue
            if marker_byte == 0xDA or pos + 4 > length:
                # Start of scan: entropy-coded data follows, which must not
                # be interpreted as marker segments.  A truncated segment
                # header is likewise copied as-is instead of failing.
                result.write(data[pos:])
                break

            (seg_length,) = struct.unpack_from('>H', data, pos + 2)
            seg_end = pos + 2 + seg_length
            if marker_byte != 0xED:
                # Everything except APP13 is preserved.
                result.write(data[pos:seg_end])
            pos = seg_end

        result.seek(0)
        return result

    @staticmethod
    def _app13_offset(jpeg_data: bytes) -> int:
        """
        Return the offset at which a new APP13 segment should be inserted.

        Leading APP0 (JFIF) and APP1 (Exif/XMP) segments are skipped so that
        they keep their position directly behind the SOI marker.
        """
        pos = 2
        total = len(jpeg_data)
        while pos + 4 <= total and jpeg_data[pos] == 0xFF and jpeg_data[pos + 1] in (0xE0, 0xE1):
            (seg_length,) = struct.unpack_from('>H', jpeg_data, pos + 2)
            seg_end = pos + 2 + seg_length
            if seg_length < 2 or seg_end > total:
                # Malformed or truncated segment: do not insert beyond it.
                break
            pos = seg_end
        return pos

    def combine(self, file: IO, metadata: Mapping[str, Mapping]) -> IO:
        """
        Returns a copy of the JPEG file with IPTC metadata embedded.

        Existing IPTC data is replaced. *metadata* may only contain the
        ``'iptc'`` format. Field names inside the ``'iptc'`` mapping that are
        not supported are ignored. If no supported field is given, the
        result is the file with its existing IPTC data removed.

        :param file: Readable binary file-like object containing JPEG data.
        :type file: IO
        :param metadata: Mapping with an ``'iptc'`` key whose value is a dict
            of IPTC field names and values.
        :type metadata: Mapping
        :return: File-like object with IPTC data embedded.
        :rtype: IO
        :raises UnsupportedFormatError: if *metadata* contains a format other
            than ``'iptc'`` or if the data is not a JPEG file.
        """
        for fmt in metadata:
            if fmt not in self.formats:
                raise UnsupportedFormatError(f'Metadata format {fmt!r} is not supported.')

        # Strip any existing IPTC first.
        stripped = self.strip(file)
        stripped.seek(0)
        jpeg_data = stripped.read()

        iptc_fields = metadata.get('iptc', {})

        # Build IPTC IIM records.
        records: list[bytes] = []
        for key, value in (iptc_fields or {}).items():
            dataset = _KEY_TO_DATASET.get(key)
            if dataset is None:
                continue
            if key == 'keywords':
                # A lone keyword may be passed without a surrounding list.
                keywords = [value] if isinstance(value, (str, bytes)) else list(value)
                records.extend(_make_iptc_record(dataset, keyword) for keyword in keywords)
            else:
                records.append(_make_iptc_record(dataset, str(value)))

        if not records:
            # Nothing to embed: avoid writing an empty APP13 block.
            return io.BytesIO(jpeg_data)

        app13 = _build_app13(b''.join(records))
        # Insert APP13 behind SOI and any leading APP0/APP1 segments.
        offset = self._app13_offset(jpeg_data)
        return io.BytesIO(jpeg_data[:offset] + app13 + jpeg_data[offset:])