Initial commit

2026-01-01 21:57:33 -08:00
commit d246d2a0d7
6 changed files with 285 additions and 0 deletions
--- a/processor.py
+++ b/processor.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""Minimal processor prototype: convert a PDF page to an image and run PaddleOCR.
+
+This script is intentionally small and defensive: it checks for missing
+dependencies and prints actionable instructions.
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+try:
+    import yaml
+    from pdf2image import convert_from_path
+    from paddleocr import PaddleOCR
+    import numpy as np
+except Exception as exc:  # pragma: no cover - runtime dependency guard
+    print("Dependency error:", exc)
+    print("Please install requirements: pip install -r requirements.txt")
+    sys.exit(1)
+
+
+def process_pdf_first_page(pdf_path: str) -> list:
+    """Render the first page of a PDF and return PaddleOCR's output for it.
+
+    Args:
+        pdf_path: Path of the PDF file, handed to ``convert_from_path``.
+
+    Returns:
+        The value returned by ``PaddleOCR.ocr`` for the rendered page,
+        unmodified.
+
+    Raises:
+        RuntimeError: If ``convert_from_path`` yields no pages.
+    """
+    # Restrict rendering to page 1 so the other pages are never rasterized.
+    rendered = convert_from_path(pdf_path, first_page=1, last_page=1)
+    if not rendered:
+        raise RuntimeError("No pages returned from pdf2image")
+
+    engine = PaddleOCR(use_angle_cls=True, lang='en')
+    # The page image is converted to a numpy array so it can be passed to
+    # PaddleOCR in memory instead of via a temporary file.
+    first_page_pixels = np.array(rendered[0])
+    return engine.ocr(first_page_pixels, cls=True)
+
+
+def load_conf(path: str) -> dict:
+    """Load the YAML configuration file at *path*.
+
+    Args:
+        path: Filesystem path of the YAML file.
+
+    Returns:
+        The parsed top-level mapping, or an empty dict when the document
+        parses to a falsy value.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        ValueError: If the top-level YAML value is not a mapping.
+
+    Errors raised by ``yaml.safe_load`` for malformed content propagate
+    unchanged.
+    """
+    # Decode explicitly as UTF-8 instead of relying on the platform's
+    # locale-dependent default encoding.
+    with open(path, "r", encoding="utf-8") as fh:
+        data = yaml.safe_load(fh) or {}
+    # Callers use ``.get`` on the result, so reject lists/scalars here with a
+    # clear message rather than failing later with an AttributeError.
+    if not isinstance(data, dict):
+        raise ValueError(
+            f"Top-level YAML value in {path!r} must be a mapping, "
+            f"got {type(data).__name__}"
+        )
+    return data
+
+
+def main() -> None:
+    """Command-line entry point: resolve the input PDF and print its OCR result.
+
+    The input is taken from ``--input`` if given, otherwise from the
+    ``input_file`` or ``input_dir`` key of the config file (``--conf``,
+    default ``conf.yaml``); a config path that does not exist is ignored.
+    When the input is a directory, the first regular file (in ``sorted``
+    order) whose name ends in ``.pdf`` (case-insensitive) is used.
+
+    Exit codes:
+        2: no input was specified.
+        3: the input directory contains no PDF files.
+        4: the input path does not exist.
+        5: PDF conversion or OCR raised an exception.
+        6: the config file could not be loaded or is not a mapping.
+    """
+    parser = argparse.ArgumentParser(description="Processor prototype")
+    parser.add_argument("--conf", default="conf.yaml", help="Path to conf.yaml")
+    parser.add_argument(
+        "--input", help="Path to input PDF or input directory (overrides conf)")
+    args = parser.parse_args()
+
+    conf = {}
+    if os.path.exists(args.conf):
+        # Report an unreadable or malformed config as a one-line message with
+        # its own exit code instead of letting a traceback escape.
+        try:
+            conf = load_conf(args.conf)
+        except Exception as exc:
+            print("Config error:", exc)
+            sys.exit(6)
+        # The lookups below need a mapping; a YAML list or scalar is invalid.
+        if not isinstance(conf, dict):
+            print("Config error: top-level YAML value must be a mapping:", args.conf)
+            sys.exit(6)
+
+    input_spec = args.input or conf.get("input_file") or conf.get("input_dir")
+    if not input_spec:
+        print("No input specified. Set --input or define 'input_file' / 'input_dir' in conf.yaml")
+        sys.exit(2)
+
+    if os.path.isdir(input_spec):
+        # Only regular files qualify: a sub-directory that happens to be
+        # named "*.pdf" must not be selected as the document.
+        pdf_names = sorted(
+            name
+            for name in os.listdir(input_spec)
+            if name.lower().endswith(".pdf")
+            and os.path.isfile(os.path.join(input_spec, name))
+        )
+        if not pdf_names:
+            print("No PDF files found in directory:", input_spec)
+            sys.exit(3)
+        input_spec = os.path.join(input_spec, pdf_names[0])
+
+    if not os.path.exists(input_spec):
+        print("Input path does not exist:", input_spec)
+        sys.exit(4)
+
+    print("Processing:", input_spec)
+    # Top-level boundary: any conversion/OCR failure becomes exit code 5.
+    try:
+        res = process_pdf_first_page(input_spec)
+    except Exception as exc:
+        print("Processing error:", exc)
+        sys.exit(5)
+
+    print("OCR result (first page):")
+    for block in res:
+        print(block)
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()