Files
docintake-gm/processor.py
2026-01-01 21:57:33 -08:00

83 lines
2.4 KiB
Python

#!/usr/bin/env python3
"""Minimal processor prototype: convert a PDF page to an image and run PaddleOCR.
This script is intentionally small and defensive: it checks for missing
dependencies and prints actionable instructions.
"""
from __future__ import annotations
import argparse
import os
import sys
try:
import yaml
from pdf2image import convert_from_path
from paddleocr import PaddleOCR
import numpy as np
except Exception as exc: # pragma: no cover - runtime dependency guard
print("Dependency error:", exc)
print("Please install requirements: pip install -r requirements.txt")
sys.exit(1)
def process_pdf_first_page(pdf_path: str) -> list:
    """Run PaddleOCR on the first page of the PDF at *pdf_path*.

    Only page 1 is rasterized by pdf2image; that image is converted to a
    numpy array and passed to PaddleOCR. The OCR output is returned
    unchanged.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Whatever ``PaddleOCR.ocr`` returns for the page image.

    Raises:
        RuntimeError: If pdf2image yields no pages.
    """
    rendered = convert_from_path(pdf_path, first_page=1, last_page=1)
    if not rendered:
        raise RuntimeError("No pages returned from pdf2image")

    engine = PaddleOCR(use_angle_cls=True, lang='en')
    # The page is handed over in memory as a numpy array rather than
    # being written to a temporary image file first.
    page_pixels = np.array(rendered[0])
    return engine.ocr(page_pixels, cls=True)
def load_conf(path: str) -> dict:
    """Load the YAML configuration file at *path*.

    The file is opened in binary mode so that PyYAML performs its own
    encoding detection (UTF-8 by default, UTF-16 via BOM) instead of the
    result depending on the platform's locale encoding.

    Args:
        path: Filesystem path of the YAML file.

    Returns:
        The parsed top-level mapping, or an empty dict when the document
        is empty (or otherwise parses to a falsy value).

    Raises:
        OSError: If the file cannot be opened or read.
        yaml.YAMLError: If the content is not valid YAML.
        ValueError: If the top-level YAML value is not a mapping.
    """
    with open(path, "rb") as fh:
        data = yaml.safe_load(fh) or {}
    # Callers use dict methods such as ``.get`` on the result; reject other
    # top-level shapes here with a clear message instead of letting them
    # fail later with an AttributeError.
    if not isinstance(data, dict):
        raise ValueError(
            f"{path}: expected a YAML mapping at top level, "
            f"got {type(data).__name__}"
        )
    return data
def main() -> None:
    """Command-line entry point: OCR the first page of a single PDF.

    The input is taken from ``--input`` when given, otherwise from the
    ``input_file`` or ``input_dir`` keys of the YAML config file
    (``--conf``, default ``conf.yaml``). A config file that does not exist
    is treated as empty. When the input is a directory, the first entry
    (in sorted order) whose name ends with ``.pdf``, case-insensitively,
    is processed.

    Exit codes:
        2: no input was specified
        3: the input directory contains no PDF files
        4: the input path does not exist
        5: PDF conversion or OCR raised an exception
        6: the config file exists but could not be read or parsed
    """
    p = argparse.ArgumentParser(description="Processor prototype")
    p.add_argument("--conf", default="conf.yaml", help="Path to conf.yaml")
    p.add_argument(
        "--input", help="Path to input PDF or input directory (overrides conf)")
    args = p.parse_args()

    conf = {}
    if os.path.exists(args.conf):
        # A present-but-broken config is reported like every other failure
        # in this script (message + exit code) rather than as a traceback.
        # ValueError also covers UnicodeDecodeError from a mis-encoded file.
        try:
            conf = load_conf(args.conf)
        except (OSError, ValueError, yaml.YAMLError) as exc:
            print("Config error:", exc)
            sys.exit(6)
        if not isinstance(conf, dict):
            print("Config error: top-level YAML value must be a mapping:",
                  args.conf)
            sys.exit(6)

    input_spec = args.input or conf.get("input_file") or conf.get("input_dir")
    if not input_spec:
        print("No input specified. Set --input or define 'input_file' / 'input_dir' in conf.yaml")
        sys.exit(2)

    if os.path.isdir(input_spec):
        # Prototype behavior: only the first PDF in the directory is used.
        pdf_names = [
            name for name in sorted(os.listdir(input_spec))
            if name.lower().endswith(".pdf")
        ]
        if not pdf_names:
            print("No PDF files found in directory:", input_spec)
            sys.exit(3)
        input_spec = os.path.join(input_spec, pdf_names[0])

    if not os.path.exists(input_spec):
        print("Input path does not exist:", input_spec)
        sys.exit(4)

    print("Processing:", input_spec)
    try:
        res = process_pdf_first_page(input_spec)
    except Exception as exc:
        # Top-level boundary: any conversion/OCR failure becomes exit code 5.
        print("Processing error:", exc)
        sys.exit(5)

    print("OCR result (first page):")
    for block in res:
        print(block)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()