import argparse
import base64
import glob
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time


# ---------------------------------------------------------------------------
# Local Tesseract fallback
# ---------------------------------------------------------------------------

def _tesseract_available() -> bool:
    """Return True when a ``tesseract`` executable can be found on PATH."""
    return shutil.which("tesseract") is not None


def _ocr_images(image_paths: list) -> dict:
    """OCR every image with the ``tesseract`` CLI and merge the page texts.

    Returns the success dict described in ``_run_tesseract``, or an error
    result as soon as Tesseract exits with a non-zero status for a page.
    """
    page_texts = []
    pages = []
    for page_no, img in enumerate(image_paths, start=1):
        proc = subprocess.run(
            ["tesseract", img, "stdout", "--psm", "3"],
            capture_output=True,
            text=True,
            # Tesseract writes UTF-8; do not depend on the locale encoding,
            # and never let a stray byte abort the whole fallback.
            encoding="utf-8",
            errors="replace",
        )
        stderr_text = proc.stderr.strip()
        if proc.returncode != 0:
            # A failed run must not be reported as a successful, empty OCR.
            return _error_result(
                f"FALLBACK_ERROR: tesseract exited with status "
                f"{proc.returncode} for {img}: {stderr_text}"
            )
        page_text = proc.stdout.strip()
        page_texts.append(page_text)
        pages.append(
            {"page": page_no, "image": img, "text": page_text, "stderr": stderr_text}
        )

    return {
        "ok": True,
        "text": "\n".join(page_texts),
        "result": {"engine": "tesseract-local", "pages": pages},
        "error": None,
    }


def _run_tesseract(file_path: str, file_type: int) -> dict:
    """Run local Tesseract OCR as a fallback when the remote API fails.

    Args:
        file_path: Path of the input document on disk.
        file_type: ``1`` means PDF, in which case every page is first
            rasterised to a 300 dpi PNG with ``pdftoppm``.  Any other value
            hands ``file_path`` to Tesseract unchanged.

    Returns:
        On success, a dict with ``ok`` (True), ``text`` (page texts joined
        with newlines), ``result`` (engine name plus one entry per page) and
        ``error`` (None).  For PDFs the per-page ``image`` value names a
        temporary file that has already been removed when this returns.
        On failure, whatever ``_error_result`` builds for a
        ``FALLBACK_ERROR: ...`` message; exceptions raised while running the
        external tools are converted to such a result instead of propagating.
    """
    try:
        if file_type != 1:
            return _ocr_images([file_path])

        if not shutil.which("pdftoppm"):
            return _error_result("FALLBACK_ERROR: pdftoppm not found; cannot convert PDF")

        # The context manager deletes the rasterised pages on every exit
        # path, so repeated fallbacks do not pile up PNGs in the temp dir.
        with tempfile.TemporaryDirectory(prefix="ocr_tess_") as tmpdir:
            prefix = os.path.join(tmpdir, "page")
            subprocess.run(
                ["pdftoppm", "-png", "-r", "300", file_path, prefix],
                check=True,
                capture_output=True,
            )
            # Escape the directory part: a temp root containing glob
            # metacharacters would otherwise match nothing.
            pattern = os.path.join(glob.escape(tmpdir), "page*.png")
            image_paths = sorted(glob.glob(pattern))
            if not image_paths:
                return _error_result("FALLBACK_ERROR: pdftoppm produced no images")
            return _ocr_images(image_paths)
    except subprocess.CalledProcessError as exc:
        # Surface the tool's own diagnostics; str(exc) only has the exit code.
        detail = (exc.stderr or b"").decode("utf-8", errors="replace").strip()
        suffix = f": {detail}" if detail else ""
        return _error_result(f"FALLBACK_ERROR: {exc}{suffix}")
    except Exception as exc:  # fallback boundary: always report, never crash
        return _error_result(f"FALLBACK_ERROR: {exc}")


# ---------------------------------------------------------------------------
# Output helpers
# ---------------------------------------------------------------------------
+264,15 @@ # --- Call API ----------------------------------------------------- result = _call_api(config, image_b64, file_type) + # --- Fallback to local Tesseract if remote API failed ------------- + if not result["ok"] and _tesseract_available() and args.file_path: + print( + f"Remote API failed ({result['error']['message']}); " + "falling back to local Tesseract…", + file=sys.stderr, + ) + result = _run_tesseract(args.file_path, file_type) + _emit(result, args)