83 lines
2.4 KiB
Python
83 lines
2.4 KiB
Python
#!/usr/bin/env python3
|
|
"""Minimal processor prototype: convert a PDF page to an image and run PaddleOCR.
|
|
|
|
This script is intentionally small and defensive: it checks for missing
dependencies at import time, prints actionable install instructions, and
exits with status 1 if any of them cannot be imported.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
|
|
# Third-party imports are guarded so that a missing or broken dependency
# produces an actionable message instead of a raw traceback.
try:
    import yaml
    from pdf2image import convert_from_path
    from paddleocr import PaddleOCR
    import numpy as np
except Exception as exc:  # pragma: no cover - runtime dependency guard
    # The broad catch is kept on purpose.
    # NOTE(review): heavy native packages can presumably fail to import with
    # errors other than ImportError -- confirm before narrowing this.
    # Diagnostics go to stderr so they never mix with OCR output on stdout.
    print("Dependency error:", exc, file=sys.stderr)
    print(
        "Please install requirements: pip install -r requirements.txt",
        file=sys.stderr,
    )
    sys.exit(1)
|
|
|
|
|
|
def process_pdf_first_page(pdf_path: str) -> list:
    """Run PaddleOCR on the first page of a PDF and return the raw result.

    The page is rasterized with ``convert_from_path`` (restricted to page 1)
    and handed to PaddleOCR as an in-memory NumPy array.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Whatever ``PaddleOCR.ocr`` returns for the page, unmodified.

    Raises:
        RuntimeError: If ``convert_from_path`` yields no pages.
    """
    rendered = convert_from_path(pdf_path, first_page=1, last_page=1)
    if not rendered:
        raise RuntimeError("No pages returned from pdf2image")

    first_page = rendered[0]
    engine = PaddleOCR(use_angle_cls=True, lang='en')
    # PaddleOCR accepts NumPy arrays for in-memory images, so the rendered
    # page never has to be written to disk.
    # NOTE(review): the array is passed exactly as produced by np.array();
    # confirm the channel order matches what PaddleOCR expects.
    return engine.ocr(np.array(first_page), cls=True)
|
|
|
|
|
|
def load_conf(path: str) -> dict:
    """Load the YAML configuration file at *path*.

    Args:
        path: Location of the YAML file.

    Returns:
        The parsed top-level mapping, or an empty dict when the document is
        empty (or otherwise parses to a falsy value).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
        ValueError: If the document's top level is not a mapping.
    """
    # Explicit UTF-8: without it open() falls back to the platform's locale
    # encoding, so the same config could parse differently across machines.
    with open(path, "r", encoding="utf-8") as fh:
        conf = yaml.safe_load(fh) or {}

    # Callers use dict methods on the result; fail here with a clear message
    # rather than later with an AttributeError on a list or scalar.
    if not isinstance(conf, dict):
        raise ValueError(
            f"Expected a mapping at the top level of {path!r}, "
            f"got {type(conf).__name__}"
        )
    return conf
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: pick an input PDF, OCR page 1, print it.

    The input comes from ``--input`` when given, otherwise from the
    ``input_file`` or ``input_dir`` keys of the YAML config named by
    ``--conf`` (default ``conf.yaml``; a missing config file is ignored).
    When the input is a directory, the first entry in sorted order whose
    name ends in ``.pdf`` (case-insensitive) is used.

    Progress and OCR output are written to stdout; error messages are
    written to stderr so the two streams can be separated.

    Exit codes:
        2: no input was specified.
        3: the input directory contains no PDF files.
        4: the resolved input path does not exist.
        5: OCR processing raised an exception.
    """
    parser = argparse.ArgumentParser(description="Processor prototype")
    parser.add_argument("--conf", default="conf.yaml", help="Path to conf.yaml")
    parser.add_argument(
        "--input", help="Path to input PDF or input directory (overrides conf)")
    args = parser.parse_args()

    # A missing config file is not an error: --input alone is sufficient.
    conf = load_conf(args.conf) if os.path.exists(args.conf) else {}

    input_spec = args.input or conf.get("input_file") or conf.get("input_dir")
    if not input_spec:
        print(
            "No input specified. Set --input or define 'input_file' / 'input_dir' in conf.yaml",
            file=sys.stderr,
        )
        sys.exit(2)

    if os.path.isdir(input_spec):
        # Scan the whole listing before giving up, so a non-PDF entry that
        # happens to sort first can never abort the search.
        first_pdf = next(
            (
                name
                for name in sorted(os.listdir(input_spec))
                if name.lower().endswith(".pdf")
            ),
            None,
        )
        if first_pdf is None:
            print("No PDF files found in directory:", input_spec, file=sys.stderr)
            sys.exit(3)
        input_spec = os.path.join(input_spec, first_pdf)

    if not os.path.exists(input_spec):
        print("Input path does not exist:", input_spec, file=sys.stderr)
        sys.exit(4)

    print("Processing:", input_spec)
    try:
        res = process_pdf_first_page(input_spec)
    except Exception as exc:
        # Top-level boundary: report any processing failure and exit non-zero
        # instead of dumping a traceback.
        print("Processing error:", exc, file=sys.stderr)
        sys.exit(5)

    print("OCR result (first page):")
    for block in res:
        print(block)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, so importing the
# module does not trigger argument parsing or OCR.
if __name__ == "__main__":
    main()
|