Skip to content

Quickstart example

A "hello world" that opens one or more documents and prints the detected format, page count, title and a 60-character preview of every page's text.

Usage

python examples/quickstart.py PATH [PATH ...]

PATH can be any supported document (PDF, DOCX, XLSX, HTML). The exit code is the worst status across all inputs.

Source

"""Quickstart — open a document and print a one-line summary per page.

Usage:
    python examples/quickstart.py PATH [PATH ...]

Each ``PATH`` can be any supported document (PDF, DOCX, XLSX, HTML). The
script opens the document, prints the detected format and page count, and
shows the first 60 characters of every page's extracted text. It is meant
as a literal "hello world" for the library — copy the body into a REPL to
start exploring.
"""

from __future__ import annotations

import sys
from pathlib import Path

import olgadoc


def summarise(path: Path) -> int:
    """Print a compact summary of ``path``. Returns ``0`` on success."""
    try:
        doc = olgadoc.Document.open(str(path))
    except olgadoc.OlgaError as exc:
        print(f"[{path.name}] failed to open: {exc}", file=sys.stderr)
        return 1

    print(f"=== {path.name} ===")
    print(f"  format      : {doc.format}")
    print(f"  page_count  : {doc.page_count}")
    print(f"  file_size   : {doc.file_size} bytes")
    print(f"  title       : {doc.title or '(none)'}")
    print(f"  encrypted   : {doc.encrypted}")

    for page in doc.pages():
        preview = page.text().strip().replace("\n", " ")
        if len(preview) > 60:
            preview = preview[:57] + "..."
        print(f"  page {page.number:>3}  : {preview or '(empty)'}")
    return 0


def main(argv: list[str]) -> int:
    """CLI entry point. Returns a process exit code."""
    if len(argv) < 2:
        print(__doc__, file=sys.stderr)
        return 2
    return max(summarise(Path(arg)) for arg in argv[1:])


if __name__ == "__main__":
    raise SystemExit(main(sys.argv))