Skip to content

Batch processability

Walks a directory tree recursively, calls Document.processability() on every supported file, and prints a one-line verdict per document. With --json, emits one JSON object per line — perfect for jq or a downstream ingestion pipeline.

Exits non-zero when any document is blocked or errors out, so this doubles as a pre-flight gate in CI.

Usage

python examples/batch_processability.py ROOT          # pretty text
python examples/batch_processability.py ROOT --json   # JSONL output

Source

"""Batch health-check a directory tree of documents.

Usage:
    python examples/batch_processability.py ROOT [--json]

Walks ``ROOT`` recursively, opens every supported document, and prints a
one-line health verdict per file — ``ok``, ``degraded`` (with the list of
degradation kinds) or ``blocked`` (with the blocker kinds). With
``--json``, emits one JSON object per line instead, suitable for piping
into ``jq`` or a downstream ingestion pipeline.

The exit code is non-zero when at least one document is blocked, so this
doubles as a pre-flight gate for batch processing jobs.
"""

from __future__ import annotations

import argparse
import dataclasses
import json
import sys
from pathlib import Path
from typing import Iterator

import olgadoc

SUPPORTED_SUFFIXES = {".pdf", ".docx", ".xlsx", ".html", ".htm"}


@dataclasses.dataclass(frozen=True)
class Verdict:
    """One processability verdict for a single file."""

    path: str
    health: str  # "ok" | "degraded" | "blocked" | "error"
    pages_total: int = 0
    pages_with_content: int = 0
    warning_count: int = 0
    blockers: tuple[str, ...] = ()
    degradations: tuple[str, ...] = ()
    error: str | None = None

    def to_dict(self) -> dict[str, object]:
        """Render as a plain JSON-serialisable dict."""
        return dataclasses.asdict(self)


def _iter_documents(root: Path) -> Iterator[Path]:
    """Yield every file under ``root`` whose suffix we can open."""
    for entry in sorted(root.rglob("*")):
        if entry.is_file() and entry.suffix.lower() in SUPPORTED_SUFFIXES:
            yield entry


def _check(path: Path) -> Verdict:
    """Run :meth:`Document.processability` and pack the verdict."""
    try:
        doc = olgadoc.Document.open(str(path))
    except olgadoc.OlgaError as exc:
        return Verdict(path=str(path), health="error", error=str(exc))
    report = doc.processability()
    return Verdict(
        path=str(path),
        health=report.health,
        pages_total=report.pages_total,
        pages_with_content=report.pages_with_content,
        warning_count=report.warning_count,
        blockers=tuple(issue["kind"] for issue in report.blockers),
        degradations=tuple(issue["kind"] for issue in report.degradations),
    )


def _format_line(verdict: Verdict) -> str:
    """Render a verdict as a compact aligned line."""
    health = verdict.health.upper()
    if health == "OK":
        return f"{health:<9} {verdict.path}"
    if health == "ERROR":
        return f"{health:<9} {verdict.path} ({verdict.error})"
    issues = verdict.blockers if health == "BLOCKED" else verdict.degradations
    issues_txt = ", ".join(issues) or "-"
    return f"{health:<9} {verdict.path} [{issues_txt}]"


def main(argv: list[str]) -> int:
    """CLI entry point. Returns a process exit code."""
    parser = argparse.ArgumentParser(
        description="Batch-check document processability.",
    )
    parser.add_argument("root", type=Path, help="directory to walk recursively")
    parser.add_argument(
        "--json",
        action="store_true",
        help="emit one JSON object per line instead of pretty text",
    )
    args = parser.parse_args(argv[1:])

    if not args.root.is_dir():
        print(f"{args.root}: not a directory", file=sys.stderr)
        return 2

    worst = "ok"
    ranks = {"ok": 0, "degraded": 1, "error": 2, "blocked": 3}
    for path in _iter_documents(args.root):
        verdict = _check(path)
        if args.json:
            print(json.dumps(verdict.to_dict()))
        else:
            print(_format_line(verdict))
        if ranks.get(verdict.health, 0) > ranks.get(worst, 0):
            worst = verdict.health

    # Exit non-zero if any document blocked processing — useful in CI gates.
    return 1 if worst in {"blocked", "error"} else 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv))