Initial commit
This commit is contained in:
82
processor.py
Normal file
82
processor.py
Normal file
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Minimal processor prototype: convert a PDF page to an image and run PaddleOCR.
|
||||
|
||||
This script is intentionally small and defensive: it checks for missing
|
||||
dependencies and prints actionable instructions.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Third-party dependencies are imported under a guard so that a missing or
# broken installation produces an actionable message instead of a traceback.
try:
    import yaml
    from pdf2image import convert_from_path
    from paddleocr import PaddleOCR
    import numpy as np
# Deliberately broad: the guard reports any failure raised while importing
# these packages, not only a missing module.
except Exception as exc:  # pragma: no cover - runtime dependency guard
    # Diagnostics go to stderr so they never mix with OCR output on stdout.
    print("Dependency error:", exc, file=sys.stderr)
    print("Please install requirements: pip install -r requirements.txt",
          file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
|
||||
def process_pdf_first_page(pdf_path: str) -> list:
    """Rasterize the first page of a PDF and run PaddleOCR on it.

    Args:
        pdf_path: Path of the PDF file. Only page 1 is converted.

    Returns:
        The entries produced by ``PaddleOCR.ocr`` for the page image, as a
        list with ``None`` entries removed; an empty list when the engine
        returns ``None``.
        NOTE(review): the structure of each entry depends on the installed
        PaddleOCR release - confirm against the pinned version.

    Raises:
        RuntimeError: If pdf2image returns no pages for the file.
    """
    pages = convert_from_path(pdf_path, first_page=1, last_page=1)
    if not pages:
        raise RuntimeError("No pages returned from pdf2image")

    engine = PaddleOCR(use_angle_cls=True, lang="en")
    # PaddleOCR accepts numpy arrays for in-memory images.
    raw = engine.ocr(np.array(pages[0]), cls=True)

    # Honour the declared ``list`` return type: presumably some PaddleOCR
    # releases report "no text found" as None or as a None entry (TODO
    # confirm); callers iterate the result, so normalise both cases here.
    if raw is None:
        return []
    return [entry for entry in raw if entry is not None]
|
||||
|
||||
|
||||
def load_conf(path: str) -> dict:
    """Load the YAML configuration file at *path* as a mapping.

    Args:
        path: Filesystem path of the YAML file.

    Returns:
        The parsed top-level mapping, or an empty dict when the document is
        empty (or otherwise parses to a falsy value).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the document's top level is not a mapping.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's
    # default encoding, which varies between systems.
    with open(path, "r", encoding="utf-8") as fh:
        data = yaml.safe_load(fh) or {}

    # Callers use ``.get`` on the result; fail here with a clear message
    # instead of letting a list/scalar document surface as AttributeError.
    if not isinstance(data, dict):
        raise ValueError(
            f"Config file {path!r} must contain a mapping at the top level, "
            f"got {type(data).__name__}"
        )
    return data
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: resolve the input PDF and print its OCR.

    The input is taken from ``--input`` when given, otherwise from the
    ``input_file`` key and then the ``input_dir`` key of the YAML config
    named by ``--conf`` (the config is skipped when that file does not
    exist). When the input is a directory, the first entry in sorted order
    whose name ends with ``.pdf`` (case-insensitive) is used.

    Error messages are written to stderr; the progress line and the OCR
    output are written to stdout.

    Exit codes:
        2: no input was specified.
        3: the input directory contains no PDF files.
        4: the resolved input path does not exist.
        5: OCR processing raised an exception.
    """
    parser = argparse.ArgumentParser(description="Processor prototype")
    parser.add_argument("--conf", default="conf.yaml", help="Path to conf.yaml")
    parser.add_argument(
        "--input", help="Path to input PDF or input directory (overrides conf)")
    args = parser.parse_args()

    # A missing config file is not an error: --input alone is sufficient.
    conf = load_conf(args.conf) if os.path.exists(args.conf) else {}

    input_spec = args.input or conf.get("input_file") or conf.get("input_dir")
    if not input_spec:
        print(
            "No input specified. Set --input or define 'input_file' / 'input_dir' in conf.yaml",
            file=sys.stderr,
        )
        sys.exit(2)

    if os.path.isdir(input_spec):
        # Pick the first PDF in sorted order so the choice is deterministic.
        pdf_names = sorted(
            name for name in os.listdir(input_spec)
            if name.lower().endswith(".pdf")
        )
        if not pdf_names:
            print("No PDF files found in directory:", input_spec, file=sys.stderr)
            sys.exit(3)
        input_spec = os.path.join(input_spec, pdf_names[0])

    if not os.path.exists(input_spec):
        print("Input path does not exist:", input_spec, file=sys.stderr)
        sys.exit(4)

    print("Processing:", input_spec)
    try:
        res = process_pdf_first_page(input_spec)
    except Exception as exc:  # top-level boundary: report and exit non-zero
        print("Processing error:", exc, file=sys.stderr)
        sys.exit(5)

    print("OCR result (first page):")
    # Tolerate a None result so an empty page does not end in a TypeError.
    for block in res or []:
        print(block)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the CLI only when executed as a script, not when imported.
    main()
|
||||
Reference in New Issue
Block a user