Search and extract
Runs Document.search(QUERY) and prints every hit together with the
opening text of the page it was found on — a compact demo of how to pair
SearchHit metadata (page, line, col_start, match,
snippet) with the page's full text rendering.
Usage
python examples/search_and_extract.py PATH QUERY [--context N]
--context controls how many characters from the start of the hit's page
text to print (default 200).
Source
"""Search a document and print every hit with its surrounding page text.
Usage:
python examples/search_and_extract.py PATH QUERY [--context N]
Opens ``PATH``, runs :meth:`Document.search` with ``QUERY`` and prints one
block per hit — the page number, the snippet the engine returned, and the
first ``N`` characters of the text of the page the hit is on (default 200).
This example demonstrates three things side-by-side: the high-level
:meth:`Document.search` API, the per-page :meth:`Page.text` rendering,
and the :class:`SearchHit` payload's ``line`` / ``col_start`` / ``match``
/ ``snippet`` fields.
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
import olgadoc
def main(argv: list[str]) -> int:
    """Run the search-and-extract CLI and return a process exit code.

    Args:
        argv: Full ``sys.argv``-style argument vector. Element 0 (the program
            name) is skipped; only ``argv[1:]`` is parsed.

    Returns:
        ``1`` if the document cannot be opened, ``0`` otherwise (including
        when the query produces no hits).

    Raises:
        SystemExit: Raised by :mod:`argparse` for invalid arguments,
            including a negative ``--context`` value.
    """
    parser = argparse.ArgumentParser(
        description="Search a document and print surrounding text per hit.",
    )
    parser.add_argument("path", type=Path, help="document to search")
    parser.add_argument("query", help="substring to look for (case-insensitive)")
    parser.add_argument(
        "--context",
        type=int,
        default=200,
        help="characters of surrounding page text to print (default: 200)",
    )
    args = parser.parse_args(argv[1:])

    # A negative N would make ``text[:N]`` mean "all but the last N
    # characters" instead of "the first N", so reject it like any other
    # malformed argument (usage message on stderr, exit status 2).
    if args.context < 0:
        parser.error("--context must be a non-negative integer")

    try:
        doc = olgadoc.Document.open(str(args.path))
    except olgadoc.OlgaError as exc:
        print(f"failed to open {args.path}: {exc}", file=sys.stderr)
        return 1

    hits = doc.search(args.query)
    if not hits:
        print(f"no hits for {args.query!r} in {args.path.name}")
        return 0

    print(f"{len(hits)} hit(s) for {args.query!r} in {args.path.name}\n")

    # Several hits commonly land on the same page; build each page's context
    # string once and reuse it. ``None`` records a page that could not be
    # fetched, so the lookup is not repeated for it either.
    contexts = {}
    for idx, hit in enumerate(hits, start=1):
        page_no = hit["page"]
        if page_no not in contexts:
            page = doc.page(page_no)
            if page is None:
                contexts[page_no] = None
            else:
                text = page.text()
                # Flatten newlines so the context stays on one output line.
                context = text[: args.context].replace("\n", " ")
                if len(text) > args.context:
                    context += "..."
                contexts[page_no] = context

        context = contexts[page_no]
        if context is None:
            # No page object for this hit: skip it but keep its index, so
            # the numbering of the remaining hits is unaffected.
            continue

        print(
            f"[{idx}] page {hit['page']}"
            f" line {hit['line']} col {hit['col_start']}"
            f" match={hit['match']!r}"
        )
        print(f" snippet: {hit['snippet']}")
        print(f" context: {context}")
        print()
    return 0
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    sys.exit(main(sys.argv))