#!/usr/bin/env python3 """Minimal processor prototype: convert a PDF page to an image and run PaddleOCR. This script is intentionally small and defensive: it checks for missing dependencies and prints actionable instructions. """ from __future__ import annotations import argparse import os import sys try: import yaml from pdf2image import convert_from_path from paddleocr import PaddleOCR import numpy as np except Exception as exc: # pragma: no cover - runtime dependency guard print("Dependency error:", exc) print("Please install requirements: pip install -r requirements.txt") sys.exit(1) def process_pdf_first_page(pdf_path: str) -> list: pages = convert_from_path(pdf_path, first_page=1, last_page=1) if not pages: raise RuntimeError("No pages returned from pdf2image") img = pages[0] ocr = PaddleOCR(use_angle_cls=True, lang='en') # PaddleOCR accepts numpy arrays for in-memory images result = ocr.ocr(np.array(img), cls=True) return result def load_conf(path: str) -> dict: with open(path, "r") as fh: return yaml.safe_load(fh) or {} def main() -> None: p = argparse.ArgumentParser(description="Processor prototype") p.add_argument("--conf", default="conf.yaml", help="Path to conf.yaml") p.add_argument( "--input", help="Path to input PDF or input directory (overrides conf)") args = p.parse_args() conf = {} if os.path.exists(args.conf): conf = load_conf(args.conf) input_spec = args.input or conf.get("input_file") or conf.get("input_dir") if not input_spec: print("No input specified. Set --input or define 'input_file' / 'input_dir' in conf.yaml") sys.exit(2) if os.path.isdir(input_spec): for fname in sorted(os.listdir(input_spec)): if fname.lower().endswith(".pdf"): input_spec = os.path.join(input_spec, fname) break else: print("No PDF files found in directory:", input_spec) sys.exit(3) if not os.path.exists(input_spec): print("Input path does not exist:", input_spec) sys.exit(4) print("Processing:", input_spec) try: res = process_pdf_first_page(input_spec) except Exception as exc: print("Processing error:", exc) sys.exit(5) print("OCR result (first page):") for block in res: print(block) if __name__ == "__main__": main()