From d246d2a0d73db44bd631b7c111a367d9c414c9da Mon Sep 17 00:00:00 2001 From: Vuong Hoang Date: Thu, 1 Jan 2026 21:57:33 -0800 Subject: [PATCH] Initial commit --- .gitignore | 22 +++++++++++++ DEVELOPMENT.md | 86 ++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 56 +++++++++++++++++++++++++++++++ conf.yaml | 11 +++++++ processor.py | 82 +++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 28 ++++++++++++++++ 6 files changed, 285 insertions(+) create mode 100644 .gitignore create mode 100644 DEVELOPMENT.md create mode 100644 README.md create mode 100644 conf.yaml create mode 100644 processor.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b7baaf8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +# Virtual environment +.venv/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +build/ +dist/ +*.egg-info/ + +# Editor dirs +.vscode/ +.idea/ + +# macOS +.DS_Store + +# Logs +*.log diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md new file mode 100644 index 0000000..d2f5822 --- /dev/null +++ b/DEVELOPMENT.md @@ -0,0 +1,86 @@ +# Document Intake (dev notes) + +## Project overview + +This project ingests PDF files of scanned forms from an input folder, extracts structured data using OCR, and stores the results in a MongoDB database. + +## Goals + +- Build a reliable pipeline to convert scanned PDF pages into images and run OCR. +- Extract form fields and normalize values (dates, numbers, names, checkboxes). +- Validate and transform extracted data into a consistent schema for MongoDB. +- Make processing resumable and observable (logs, metrics, retry). + +## Architecture / Flow + +1. Watch input folder for new PDF files (or run batch processor). +2. Convert each PDF page to image(s) using `pdf2image` (requires `poppler`). +3. Run OCR on images using `paddleocr` to get text, segmentation, and confidence. +4. 
Parse OCR results to locate fields, using heuristics and templates. +5. Validate/normalize extracted values. +6. Insert or upsert documents into MongoDB via `pymongo`. +7. Move processed PDFs to an archive or error folder. + +## Tech stack / dependencies + +- Python 3.14 (project venv: `.venv`) +- OCR: `paddleocr` (and underlying `paddlepaddle`) +- PDF → image: `pdf2image` (requires `poppler` installed on host) +- MongoDB client: `pymongo` +- Image handling: `Pillow` +- Optional: `python-dotenv` for config, `rich` for nicer logs + +System-level requirements: + +- `poppler` (for `pdf2image`) +- Optional GPU drivers if using GPU-enabled `paddlepaddle` builds + +## Environment / setup (dev) + +1. Create and activate virtualenv (already done in this workspace): + +```bash +python3 -m venv .venv +source .venv/bin/activate +``` + +2. Install Python dependencies (we will add `requirements.txt` soon): + +```bash +pip install -r requirements.txt +``` + +3. Configure a `.env` file for MongoDB connection string and paths (example): + +``` +MONGO_URI=mongodb://localhost:27017 +INPUT_DIR=./input +ARCHIVE_DIR=./archive +ERROR_DIR=./error +``` + +## Data model (draft) + +- `documents` collection: + - `_id`: UUID + - `filename`: original PDF filename + - `pages`: list of page objects; each page has `page_number`, `ocr_text`, `fields` + - `extracted_fields`: dict of normalized field names -> values + - `status`: `pending|processed|error` + - `processed_at`, `created_at` + +## Next steps / short-term TODOs + +- Create `requirements.txt` with pinned packages and document system deps. +- Add `README.md` with quickstart run instructions. +- Implement a small prototype script `processor.py` that converts a PDF to images and runs OCR on a single page. +- Add basic unit tests and a sample PDF in `samples/` for development. 
+ +## Notes / caveats + +- `paddleocr` may require selecting CPU vs GPU builds of `paddlepaddle` — document preferred build and install instructions in `requirements.txt` or a separate notes section. +- OCR of scanned forms may require pre-processing (deskewing, denoising) for acceptable accuracy. + +--- + +Created: initial development notes and plan. diff --git a/README.md b/README.md new file mode 100644 index 0000000..2504e73 --- /dev/null +++ b/README.md @@ -0,0 +1,56 @@ +# Document Intake — Quickstart + +Prerequisite: you already created the project virtual environment `.venv` in the repository root. + +## macOS system deps + +Install `poppler` (needed by `pdf2image`): + +```bash +brew install poppler +``` + +## Activate the virtual environment + +```bash +source .venv/bin/activate +``` + +## Install Python dependencies + +```bash +pip install -r requirements.txt +``` + +If you need a specific `paddlepaddle` flavor (CPU vs GPU), follow the official install guide in addition to the command above (the `paddlepaddle` line in `requirements.txt` is commented out). + +## Quick verification + +Check that `paddleocr` and `pymongo` import successfully: + +```bash +python -c "import paddleocr; import pymongo; print('imports OK')" +``` + +## Running the processor (prototype) + +The prototype script `processor.py`: + +- Converts the first page of a PDF to an image using `pdf2image`. +- Runs OCR on that page with `paddleocr`. +- Prints the raw OCR results. + +To run the prototype: + +```bash +python processor.py --input samples/example.pdf +``` + +## Useful files + +- Development notes: [DEVELOPMENT.md](DEVELOPMENT.md) +- Python dependencies: [requirements.txt](requirements.txt) + +--- + +Next step: add a `samples/` folder with a placeholder PDF (for example `samples/example.pdf`) for local testing.
diff --git a/conf.yaml b/conf.yaml
new file mode 100644
index 0000000..f46842f
--- /dev/null
+++ b/conf.yaml
@@ -0,0 +1,12 @@
+## Configuration for the processor prototype
+
+# Either point `input_dir` to a folder containing PDFs, or set `input_file`
+# (`input_file` wins when non-empty; for a folder, only the first PDF by name is processed)
+input_dir: ./input
+input_file: ""
+
+archive_dir: ./archive
+error_dir: ./error
+
+# MongoDB connection (optional for prototype)
+mongo_uri: "mongodb://localhost:27017"
diff --git a/processor.py b/processor.py
new file mode 100644
index 0000000..6556d53
--- /dev/null
+++ b/processor.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+"""Minimal processor prototype: convert a PDF page to an image and run PaddleOCR.
+
+This script is intentionally small and defensive: it checks for missing
+dependencies and prints actionable instructions.
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+try:
+    import yaml
+    from pdf2image import convert_from_path
+    from paddleocr import PaddleOCR
+    import numpy as np
+except Exception as exc:  # pragma: no cover - runtime dependency guard
+    print("Dependency error:", exc, file=sys.stderr)
+    print("Please install requirements: pip install -r requirements.txt", file=sys.stderr)
+    sys.exit(1)
+
+
+def process_pdf_first_page(pdf_path: str) -> list:
+    """Render the first page of a PDF and return PaddleOCR's output for it.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+
+    Returns:
+        The value returned by ``PaddleOCR.ocr`` for the rendered page,
+        passed through unchanged.
+
+    Raises:
+        RuntimeError: If ``pdf2image`` returns no pages for the file.
+    """
+    pages = convert_from_path(pdf_path, first_page=1, last_page=1)
+    if not pages:
+        raise RuntimeError("No pages returned from pdf2image")
+    ocr = PaddleOCR(use_angle_cls=True, lang='en')
+    # PaddleOCR accepts numpy arrays for in-memory images.
+    # `cls` is left at its default: PaddleOCR 2.x defaults it to True, and
+    # 3.x (allowed by `paddleocr>=2.7.0`) is reported to reject the keyword.
+    # TODO: confirm against the PaddleOCR version actually installed.
+    return ocr.ocr(np.array(pages[0]))
+
+
+def load_conf(path: str) -> dict:
+    """Load the YAML config at *path*; an empty file yields an empty dict.
+
+    Raises:
+        ValueError: If the top-level YAML value is not a mapping (``main``
+            reads settings with ``dict.get``).
+    """
+    with open(path, "r", encoding="utf-8") as fh:
+        data = yaml.safe_load(fh) or {}
+    if not isinstance(data, dict):
+        raise ValueError(f"{path}: top-level YAML value must be a mapping")
+    return data
+
+
+def main() -> None:
+    """Resolve the input PDF from CLI/config and print its first-page OCR result.
+
+    Exit codes: 2 no input configured, 3 no PDF in the input directory,
+    4 input path missing, 5 processing failure, 6 config file unusable.
+    """
+    p = argparse.ArgumentParser(description="Processor prototype")
+    p.add_argument("--conf", default="conf.yaml", help="Path to conf.yaml")
+    p.add_argument(
+        "--input", help="Path to input PDF or input directory (overrides conf)")
+    args = p.parse_args()
+
+    # A missing config file is fine; --input alone is enough to run.
+    conf = {}
+    if os.path.exists(args.conf):
+        try:
+            conf = load_conf(args.conf)
+        except (OSError, ValueError, yaml.YAMLError) as exc:
+            print("Config error:", exc, file=sys.stderr)
+            sys.exit(6)
+
+    # Precedence: --input, then `input_file`, then `input_dir`.
+    input_spec = args.input or conf.get("input_file") or conf.get("input_dir")
+    if not input_spec:
+        print("No input specified. Set --input or define 'input_file' / 'input_dir' in conf.yaml",
+              file=sys.stderr)
+        sys.exit(2)
+
+    if os.path.isdir(input_spec):
+        # The prototype handles one file: take the first PDF in name order.
+        pdf_names = [n for n in sorted(os.listdir(input_spec))
+                     if n.lower().endswith(".pdf")]
+        if not pdf_names:
+            print("No PDF files found in directory:", input_spec, file=sys.stderr)
+            sys.exit(3)
+        input_spec = os.path.join(input_spec, pdf_names[0])
+
+    if not os.path.exists(input_spec):
+        print("Input path does not exist:", input_spec, file=sys.stderr)
+        sys.exit(4)
+
+    print("Processing:", input_spec)
+    try:
+        res = process_pdf_first_page(input_spec)
+    except Exception as exc:
+        print("Processing error:", exc, file=sys.stderr)
+        sys.exit(5)
+
+    print("OCR result (first page):")
+    for block in res:
+        print(block)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..755c57b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,29 @@
+# Core OCR
+paddleocr>=2.7.0
+# PaddlePaddle (choose CPU or GPU build appropriate for your system)
+# For CPU-only install: `pip install paddlepaddle` or follow the official install guide
+#paddlepaddle>=2.5.0
+
+# PDF -> image
+pdf2image>=1.16.0
+Pillow>=10.0.0
+numpy  # imported directly by processor.py; version left for paddleocr to constrain
+
+# Database
+pymongo>=4.4.0
+
+# Utilities
+python-dotenv>=1.0.0
+rich>=13.0.0
+tqdm>=4.65.0
+
+# Testing / linting (optional)
+pytest>=8.0.0
+mypy>=1.5.0
+
+# YAML config parsing
+PyYAML>=6.0
+
+# Notes:
+# - `pdf2image` requires the `poppler` system package (e.g. `brew install poppler` on macOS).
+# - Select the appropriate `paddlepaddle` wheel for your platform (CPU vs GPU, macOS vs Linux).
\ No newline at end of file