Spaces:

mr3vial
/

paleo-hebrew-project

Sleeping

App Files Files Community

Niiaz Bureev commited on Feb 26

Commit

8816141

1 Parent(s): 9afba6d

added draft of demo

Browse files

Files changed (3) hide show

.gitignore +1 -0
app.py +504 -0
requirements.txt +13 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .ipynb_checkpoints/

app.py ADDED Viewed

	@@ -0,0 +1,504 @@

+import os
+import gc
+import json
+import time
+from pathlib import Path
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+import numpy as np
+from PIL import Image, ImageDraw
+import gradio as gr
+import torch
+from huggingface_hub import hf_hub_download
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+# =============================================================================
+# Configuration via env vars (edit in Space Settings -> Variables if needed)
+# =============================================================================
+DET_REPO_ID = os.getenv("DET_REPO_ID", "mr3vial/paleo-hebrew-yolo")
+DET_FILENAME = os.getenv("DET_FILENAME", "best.onnx")   # recommended
+DET_FALLBACK_PT = os.getenv("DET_FALLBACK_PT", "best.pt")
+CLS_REPO_ID = os.getenv("CLS_REPO_ID", "mr3vial/paleo-hebrew-convnext")
+CLS_WEIGHTS = os.getenv("CLS_WEIGHTS", "model.safetensors")
+CLS_CLASSES = os.getenv("CLS_CLASSES", "classes.txt")
+CLS_MODEL_NAME = os.getenv("CLS_MODEL_NAME", "convnext_large")
+MT5_BOX2HE_REPO = os.getenv("MT5_BOX2HE_REPO", "mr3vial/paleo-hebrew-mt5-post-ocr-processing")
+MT5_BOX2EN_REPO = os.getenv("MT5_BOX2EN_REPO", "mr3vial/paleo-hebrew-mt5-translate")
+FEEDBACK_PATH = os.getenv("FEEDBACK_PATH", "/tmp/feedback.jsonl")
+EXAMPLES_DIR = Path(__file__).parent / "examples"
+# =============================================================================
+# Helpers
+# =============================================================================
+def now_ts() -> str:
+    return time.strftime("%Y-%m-%d %H:%M:%S")
+def normalize_spaces(s: str) -> str:
+    return " ".join((s or "").strip().split())
+def get_device() -> str:
+    return "cuda" if torch.cuda.is_available() else "cpu"
+def safe_int(x: float, default: int = 0) -> int:
+    try:
+        return int(round(float(x)))
+    except Exception:
+        return default
+def sort_boxes_reading_order(boxes_xyxy: List[List[float]], rtl: bool = True) -> List[int]:
+    """
+    Rough reading order:
+      - sort by y-center, then within a line by x-center (rtl: desc, ltr: asc).
+    """
+    if not boxes_xyxy:
+        return []
+    centers = []
+    heights = []
+    for i, (x1, y1, x2, y2) in enumerate(boxes_xyxy):
+        xc = 0.5 * (x1 + x2)
+        yc = 0.5 * (y1 + y2)
+        centers.append((i, xc, yc))
+        heights.append(max(1.0, y2 - y1))
+    line_tol = float(np.median(heights)) * 0.6
+    centers_sorted = sorted(centers, key=lambda t: t[2])
+    lines: List[List[Tuple[int, float, float]]] = []
+    for item in centers_sorted:
+        if not lines:
+            lines.append([item])
+            continue
+        _, _, yc = item
+        prev_line = lines[-1]
+        prev_y = float(np.mean([p[2] for p in prev_line]))
+        if abs(yc - prev_y) <= line_tol:
+            prev_line.append(item)
+        else:
+            lines.append([item])
+    idxs: List[int] = []
+    for line in lines:
+        line_sorted = sorted(line, key=lambda t: t[1], reverse=rtl)
+        idxs.extend([i for i, _, _ in line_sorted])
+    return idxs
+# =============================================================================
+# Load models (Detector, Classifier, mT5)
+# =============================================================================
+@torch.no_grad()
+def load_detector():
+    from ultralytics import YOLO
+    try:
+        det_path = hf_hub_download(repo_id=DET_REPO_ID, filename=DET_FILENAME)
+        return YOLO(det_path), f"{DET_REPO_ID}/{DET_FILENAME}"
+    except Exception:
+        pt_path = hf_hub_download(repo_id=DET_REPO_ID, filename=DET_FALLBACK_PT)
+        return YOLO(pt_path), f"{DET_REPO_ID}/{DET_FALLBACK_PT} (PT fallback)"
+@torch.no_grad()
+def load_classifier():
+    import timm
+    from safetensors.torch import load_file as safetensors_load
+    device = get_device()
+    classes_path = hf_hub_download(repo_id=CLS_REPO_ID, filename=CLS_CLASSES)
+    with open(classes_path, "r", encoding="utf-8") as f:
+        raw = [ln.strip() for ln in f if ln.strip()]
+    letters: List[str] = []
+    for ln in raw:
+        if "_" in ln:
+            letters.append(ln.split("_")[-1])
+        else:
+            letters.append(ln)
+    weights_path = hf_hub_download(repo_id=CLS_REPO_ID, filename=CLS_WEIGHTS)
+    model = timm.create_model(CLS_MODEL_NAME, pretrained=False, num_classes=len(letters))
+    state = safetensors_load(weights_path)
+    new_state = {}
+    for k, v in state.items():
+        nk = k
+        for pref in ("module.", "model."):
+            if nk.startswith(pref):
+                nk = nk[len(pref):]
+        new_state[nk] = v
+    model.load_state_dict(new_state, strict=False)
+    model.eval().to(device)
+    return model, letters, f"{CLS_REPO_ID}/{CLS_WEIGHTS}"
+def preprocess_crop_imagenet(pil: Image.Image) -> torch.Tensor:
+    arr = np.asarray(pil.convert("RGB"), dtype=np.float32) / 255.0
+    t = torch.from_numpy(arr).permute(2, 0, 1).contiguous()
+    mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
+    std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
+    return (t - mean) / std
+@torch.no_grad()
+def load_mt5(repo_id: str):
+    device = get_device()
+    # Memory optimization: Load in bfloat16 to fit multiple models in Space RAM
+    dtype = torch.bfloat16 if (device == "cpu" or torch.cuda.is_bf16_supported()) else torch.float16
+    tok = AutoTokenizer.from_pretrained(repo_id, use_fast=False)
+    model = AutoModelForSeq2SeqLM.from_pretrained(
+        repo_id,
+        torch_dtype=dtype,
+    ).to(device).eval()
+    return tok, model
+@torch.no_grad()
+def mt5_generate(tok, model, text: str, max_new_tokens: int = 128) -> str:
+    device = get_device()
+    inp = tok([text], return_tensors="pt", truncation=True, max_length=2048).to(device)
+    out = model.generate(**inp, max_new_tokens=max_new_tokens, num_beams=4, do_sample=False)
+    return normalize_spaces(tok.batch_decode(out, skip_special_tokens=True)[0])
+# =============================================================================
+# Optional Hebrew->English translators (user-selectable)
+# =============================================================================
+TRANSLATOR_CHOICES = [
+    ("None (use direct box→en)", "none"),
+    ("OPUS-MT he→en (Helsinki-NLP/opus-mt-tc-big-he-en)", "opus"),
+    ("NLLB-200 distilled 600M (he→en) [CC-BY-NC]", "nllb"),
+    ("M2M100 418M (he→en)", "m2m100"),
+]
+_TRANSLATOR_CACHE: Dict[str, Tuple[Any, Any, str]] = {}  # kind -> (tok, model, kind)
+@torch.no_grad()
+def get_he2en_translator(kind: str):
+    global _TRANSLATOR_CACHE
+    if kind in _TRANSLATOR_CACHE:
+        return _TRANSLATOR_CACHE[kind]
+    # Memory Safety: Clear old translators from memory before loading a new one
+    _TRANSLATOR_CACHE.clear()
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    device = get_device()
+    if kind == "opus":
+        from transformers import MarianMTModel, MarianTokenizer
+        repo = "Helsinki-NLP/opus-mt-tc-big-he-en"
+        tok = MarianTokenizer.from_pretrained(repo)
+        model = MarianMTModel.from_pretrained(repo).to(device).eval()
+    elif kind == "nllb":
+        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+        repo = "facebook/nllb-200-distilled-600M"
+        tok = AutoTokenizer.from_pretrained(repo, src_lang="heb_Hebr")
+        model = AutoModelForSeq2SeqLM.from_pretrained(repo).to(device).eval()
+    elif kind == "m2m100":
+        from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+        repo = "facebook/m2m100_418M"
+        tok = M2M100Tokenizer.from_pretrained(repo)
+        tok.src_lang = "he"
+        model = M2M100ForConditionalGeneration.from_pretrained(repo).to(device).eval()
+    else:
+        _TRANSLATOR_CACHE[kind] = (None, None, "none")
+        return _TRANSLATOR_CACHE[kind]
+    _TRANSLATOR_CACHE[kind] = (tok, model, kind)
+    return _TRANSLATOR_CACHE[kind]
+@torch.no_grad()
+def translate_he_to_en(text_he: str, kind: str) -> str:
+    if kind == "none":
+        return ""
+    tok, model, kind = get_he2en_translator(kind)
+    device = get_device()
+    if kind == "opus":
+        batch = tok([text_he], return_tensors="pt", padding=True, truncation=True).to(device)
+        out = model.generate(**batch, max_new_tokens=128, num_beams=4)
+        return normalize_spaces(tok.batch_decode(out, skip_special_tokens=True)[0])
+    if kind == "nllb":
+        batch = tok([text_he], return_tensors="pt", truncation=True, max_length=512).to(device)
+        out = model.generate(**batch, forced_bos_token_id=tok.lang_code_to_id["eng_Latn"], max_new_tokens=128, num_beams=4)
+        return normalize_spaces(tok.batch_decode(out, skip_special_tokens=True)[0])
+    if kind == "m2m100":
+        batch = tok([text_he], return_tensors="pt", truncation=True, max_length=512).to(device)
+        out = model.generate(**batch, forced_bos_token_id=tok.get_lang_id("en"), max_new_tokens=128, num_beams=4)
+        return normalize_spaces(tok.batch_decode(out, skip_special_tokens=True)[0])
+    return ""
+# =============================================================================
+# Pipeline Components
+# =============================================================================
+@dataclass
+class Loaded:
+    det: Any
+    det_name: str
+    cls: Any
+    cls_letters: List[str]
+    cls_name: str
+    mt5_he_tok: Any
+    mt5_he: Any
+    mt5_en_tok: Any
+    mt5_en: Any
+LOADED: Optional[Loaded] = None
+def ensure_loaded() -> Loaded:
+    global LOADED
+    if LOADED is not None:
+        return LOADED
+    det, det_name = load_detector()
+    cls, letters, cls_name = load_classifier()
+    he_tok, he_model = load_mt5(MT5_BOX2HE_REPO)
+    en_tok, en_model = load_mt5(MT5_BOX2EN_REPO)
+    LOADED = Loaded(
+        det=det, det_name=det_name,
+        cls=cls, cls_letters=letters, cls_name=cls_name,
+        mt5_he_tok=he_tok, mt5_he=he_model,
+        mt5_en_tok=en_tok, mt5_en=en_model,
+    )
+    return LOADED
+def yolo_predict(det_model, pil: Image.Image, conf: float, iou: float, max_det: int):
+    # Direct inference, no tempfile needed
+    res = det_model.predict(
+        source=pil,
+        conf=float(conf),
+        iou=float(iou),
+        max_det=int(max_det),
+        verbose=False,
+    )
+    r0 = res[0]
+    if r0.boxes is None or len(r0.boxes) == 0:
+        return [], []
+    xyxy = r0.boxes.xyxy.detach().float().cpu().numpy().tolist()
+    scores = r0.boxes.conf.detach().float().cpu().numpy().tolist()
+    return xyxy, scores
+@torch.no_grad()
+def classify_crops(cls_model, letters: List[str], pil: Image.Image, boxes_xyxy, pad: float, topk: int):
+    device = get_device()
+    W, H = pil.size
+    crops = []
+    for (x1, y1, x2, y2) in boxes_xyxy:
+        w = max(1.0, x2 - x1)
+        h = max(1.0, y2 - y1)
+        p = int(round(max(w, h) * float(pad)))
+        a, b = max(0, safe_int(x1) - p), max(0, safe_int(y1) - p)
+        c, d = min(W, safe_int(x2) + p), min(H, safe_int(y2) + p)
+        crops.append(pil.crop((a, b, c, d)).resize((224, 224), Image.BICUBIC))
+    if not crops:
+        return [], [], []
+    batch = torch.stack([preprocess_crop_imagenet(c) for c in crops], dim=0).to(device)
+    logits = cls_model(batch)
+    probs = torch.softmax(logits, dim=-1).detach().float().cpu().numpy()
+    topk_list, top1_letters = [], []
+    for i in range(len(crops)):
+        p = probs[i]
+        idx = np.argsort(-p)[: max(1, int(topk))]
+        row = [(letters[j], float(p[j])) for j in idx]
+        topk_list.append(row)
+        top1_letters.append(row[0][0] if row else "?")
+    crop_gallery = [(crops[i], f"{i+1:02d}: {top1_letters[i]}") for i in range(len(crops))]
+    return topk_list, top1_letters, crop_gallery
+def build_source_text(boxes_xyxy: List[List[float]], scores: List[float], topk: List[List[Tuple[str, float]]], rtl: bool) -> str:
+    idxs = sort_boxes_reading_order(boxes_xyxy, rtl=rtl)
+    lines = ["[BOXES_CLS_V3]", f"n={len(idxs)} coord_norm=none"]
+    for j, i in enumerate(idxs, start=1):
+        x1, y1, x2, y2 = boxes_xyxy[i]
+        sc = scores[i] if i < len(scores) else 1.0
+        cands = topk[i] if i < len(topk) else []
+        cls_str = " ".join([f"{ch}:{p:.3f}" for ch, p in cands]) if cands else "?"
+        lines.append(f"{j:02d} x1={x1:.1f} y1={y1:.1f} x2={x2:.1f} y2={y2:.1f} score={sc:.3f} | cls {cls_str}")
+    return "\n".join(lines)
+def make_annotated(pil: Image.Image, boxes_xyxy, top1_letters, scores):
+    img = pil.copy()
+    draw = ImageDraw.Draw(img)
+    for i, bb in enumerate(boxes_xyxy):
+        x1, y1, x2, y2 = [safe_int(v) for v in bb]
+        lab = top1_letters[i] if i < len(top1_letters) else "?"
+        sc = scores[i] if i < len(scores) else 0.0
+        draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
+        text_label = f"{i+1:02d}" + (f" {lab}" if lab != "?" else "")
+        draw.text((x1, y1), text_label, fill="red")
+    return img
+# =============================================================================
+# Tab Action Handlers
+# =============================================================================
+def run_pipeline_tab(pil, conf, iou, max_det, crop_pad, topk_k, rtl, output_mode, he2en_kind):
+    if pil is None: return None, [], "", ""
+    L = ensure_loaded()
+    boxes, det_scores = yolo_predict(L.det, pil, conf, iou, max_det)
+    if not boxes: return pil, [], "No boxes detected.", "No boxes detected."
+    topk_list, top1_letters, crop_gallery = classify_crops(L.cls, L.cls_letters, pil, boxes, pad=crop_pad, topk=topk_k)
+    src = build_source_text(boxes, det_scores, topk_list, rtl=rtl)
+    heb = ""
+    eng = ""
+    if output_mode == "he":
+        heb = mt5_generate(L.mt5_he_tok, L.mt5_he, src)
+    elif output_mode == "en_direct":
+        eng = mt5_generate(L.mt5_en_tok, L.mt5_en, src)
+    else: # he_then_en
+        heb = mt5_generate(L.mt5_he_tok, L.mt5_he, src)
+        if he2en_kind != "none":
+            eng = translate_he_to_en(heb, he2en_kind)
+        else:
+            eng = mt5_generate(L.mt5_en_tok, L.mt5_en, src)
+    annotated = make_annotated(pil, boxes, top1_letters, det_scores)
+    return annotated, crop_gallery, heb, eng
+def run_detector_tab(pil, conf, iou, max_det):
+    if pil is None: return None, "{}"
+    L = ensure_loaded()
+    boxes, scores = yolo_predict(L.det, pil, conf, iou, max_det)
+    annotated = make_annotated(pil, boxes, [], scores)
+    debug = {"num_boxes": len(boxes), "boxes_xyxy": boxes, "confidences": scores}
+    return annotated, json.dumps(debug, indent=2)
+def run_classifier_tab(pil, conf, iou, max_det, crop_pad, topk_k):
+    if pil is None: return [], "{}"
+    L = ensure_loaded()
+    boxes, _ = yolo_predict(L.det, pil, conf, iou, max_det)
+    if not boxes: return [], "No boxes found to classify."
+    topk_list, top1_letters, crop_gallery = classify_crops(L.cls, L.cls_letters, pil, boxes, pad=crop_pad, topk=topk_k)
+    details = {}
+    for i, (row, box) in enumerate(zip(topk_list, boxes)):
+        details[f"Box_{i+1:02d}"] = {
+            "top_predictions": dict(row),
+            "coordinates": box
+        }
+    return crop_gallery, json.dumps(details, ensure_ascii=False, indent=2)
+def save_feedback(heb_pred, eng_pred, heb_corr, eng_corr, notes):
+    rec = {
+        "ts": now_ts(),
+        "heb_pred": heb_pred, "eng_pred": eng_pred,
+        "heb_corr": heb_corr, "eng_corr": eng_corr,
+        "notes": notes,
+    }
+    with open(FEEDBACK_PATH, "a", encoding="utf-8") as f:
+        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+    return f"Saved to {FEEDBACK_PATH}"
+# =============================================================================
+# UI
+# =============================================================================
+if EXAMPLES_DIR.exists():
+    gr.set_static_paths([str(EXAMPLES_DIR)])
+def list_example_images(max_n: int = 24):
+    if not EXAMPLES_DIR.exists(): return []
+    paths = [p for p in sorted(EXAMPLES_DIR.iterdir()) if p.suffix.lower() in {".png", ".jpg", ".jpeg", ".webp"}]
+    return [[str(p)] for p in paths[:max_n]]
+with gr.Blocks(title="Paleo-Hebrew Tablet Reader") as demo:
+    gr.Markdown("# Paleo-Hebrew Epigraphy Pipeline")
+    with gr.Row():
+        # LEFT COLUMN: Global Inputs & Settings
+        with gr.Column(scale=3):
+            inp = gr.Image(type="pil", label="Input Image")
+            with gr.Accordion("Vision Settings", open=True):
+                conf = gr.Slider(0.05, 0.95, value=0.25, step=0.01, label="YOLO conf")
+                iou = gr.Slider(0.10, 0.90, value=0.45, step=0.01, label="YOLO IoU")
+                max_det = gr.Slider(1, 200, value=80, step=1, label="Max Detections")
+                crop_pad = gr.Slider(0.0, 0.8, value=0.20, step=0.01, label="Crop Pad")
+                topk_k = gr.Slider(1, 10, value=5, step=1, label="Classifier Top-K")
+            with gr.Accordion("Translation Settings", open=True):
+                rtl = gr.Checkbox(value=True, label="RTL order (right→left)")
+                output_mode = gr.Radio(["he", "en_direct", "he_then_en"], value="he_then_en", label="Output mode")
+                he2en_kind = gr.Dropdown(choices=TRANSLATOR_CHOICES, value="opus", label="Hebrew → English translator")
+            ex = list_example_images()
+            if ex:
+                gr.Examples(examples=ex, inputs=[inp], cache_examples=False)
+        # RIGHT COLUMN: Tabs for different pipeline stages
+        with gr.Column(scale=7):
+            with gr.Tabs():
+                # TAB 1: Main E2E Pipeline
+                with gr.Tab("End-to-End Pipeline"):
+                    run_pipe_btn = gr.Button("Run Full Pipeline", variant="primary")
+                    out_annot_pipe = gr.Image(type="pil", label="Detections (bbox + top1)")
+                    with gr.Row():
+                        out_he_pipe = gr.Textbox(label="Hebrew Output", lines=3, interactive=False)
+                        out_en_pipe = gr.Textbox(label="English Output", lines=3, interactive=False)
+                    out_crops_pipe = gr.Gallery(label="Letter crops").style(grid=6, height="auto") if hasattr(gr.Gallery, 'style') else gr.Gallery(label="Letter crops")
+                # TAB 2: Detector Only
+                with gr.Tab("Detector (YOLO)"):
+                    run_det_btn = gr.Button("Run Detector", variant="primary")
+                    out_annot_det = gr.Image(type="pil", label="Detected Bounding Boxes")
+                    out_json_det = gr.Code(label="Raw Detection Output", language="json")
+                # TAB 3: Classifier Only
+                with gr.Tab("Classifier (ConvNeXt)"):
+                    run_cls_btn = gr.Button("Run Classifier", variant="primary")
+                    out_crops_cls = gr.Gallery(label="Isolated Letter Crops").style(grid=6, height="auto") if hasattr(gr.Gallery, 'style') else gr.Gallery(label="Isolated Letter Crops")
+                    out_json_cls = gr.Code(label="Top-K Probabilities per Box", language="json")
+                # TAB 4: Feedback
+                with gr.Tab("Feedback"):
+                    gr.Markdown("Submit corrections to improve the dataset.")
+                    heb_corr = gr.Textbox(label="Correct Hebrew", lines=2)
+                    eng_corr = gr.Textbox(label="Correct English", lines=2)
+                    notes = gr.Textbox(label="Notes (e.g., missed boxes, ambiguous glyphs)", lines=2)
+                    save_btn = gr.Button("Submit Feedback")
+                    save_status = gr.Textbox(label="Status", interactive=False)
+    # --- Button Clicks ---
+    run_pipe_btn.click(
+        fn=run_pipeline_tab,
+        inputs=[inp, conf, iou, max_det, crop_pad, topk_k, rtl, output_mode, he2en_kind],
+        outputs=[out_annot_pipe, out_crops_pipe, out_he_pipe, out_en_pipe],
+        api_name=False
+    )
+    run_det_btn.click(
+        fn=run_detector_tab,
+        inputs=[inp, conf, iou, max_det],
+        outputs=[out_annot_det, out_json_det],
+        api_name=False
+    )
+    run_cls_btn.click(
+        fn=run_classifier_tab,
+        inputs=[inp, conf, iou, max_det, crop_pad, topk_k],
+        outputs=[out_crops_cls, out_json_cls],
+        api_name=False
+    )
+    save_btn.click(
+        fn=save_feedback,
+        inputs=[out_he_pipe, out_en_pipe, heb_corr, eng_corr, notes],
+        outputs=[save_status],
+        api_name=False
+    )
+demo.queue(max_size=32).launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+gradio>=4.0.0
+huggingface_hub>=0.20.0
+transformers>=4.40.0
+sentencepiece
+torch
+numpy
+Pillow
+ultralytics>=8.0.0
+onnxruntime
+timm
+safetensors