Spaces:

JW-Sidhom-Lab
/

tessera

Runtime error

App Files Files Community

sidhomj committed on May 7

Commit

31cb7f9

verified ·

1 Parent(s): d2dc073

Add app.py

Browse files

Files changed (1) hide show

app.py +743 -0

app.py ADDED Viewed

	@@ -0,0 +1,743 @@

+"""TESSERA inference API - Gradio app.
+Accepts SNV CSV, CNA CSV, or both. Auto-pads the missing modality with a
+single neutral placeholder per sample (the joint InfoNCE-noLOH model has
+no cross-modal information flow at the per-token level, so per-modality
+outputs are bit-identical to a true single-modality run). Auto-selects
+the with-LoH vs without-LoH joint model based on whether the CNA CSV
+carries a LOH column.
+Returns a ZIP with per-token features, masked-token reconstruction
+predictions, a JSON summary, and intrinsic confidence metrics:
+  - SNV masked-token accuracy (per-sample + cohort)
+  - CNA segment-mean Spearman correlation (per-sample + cohort)
+These are computed for whichever modality the user actually uploaded,
+and tell the user how confident the model is in its own embeddings on
+their data distribution.
+CSV column conventions:
+  SNV: Tumor_Sample_Barcode, Chromosome, Start_Position,
+       Reference_Allele, Tumor_Seq_Allele2,
+       and either `vaf` or both `t_alt_count` and `t_ref_count`.
+  CNA: Tumor_Sample_Barcode, Chromosome, Start, End, Segment_Mean,
+       optional LOH (0/1; presence triggers the with-LoH model).
+Sample cap: MAX_SAMPLES_PER_REQUEST (currently 1000) per request. Larger
+cohorts: run inference locally with the same code path (see
+inference_api/benchmark_local.py).
+"""
+from __future__ import annotations
+import io
+import json
+import os
+import pickle
+import sys
+import tempfile
+import time
+import zipfile
+from pathlib import Path
+import gradio as gr
+import numpy as np
+import pandas as pd
+from scipy import stats
+from scipy.stats import rankdata
# ROOT is the directory one level above the folder holding this file. It is
# placed at the very front of sys.path so imports are resolved against it
# before anything else on the path.
_this_file = Path(__file__).resolve()
ROOT = _this_file.parent.parent
sys.path.insert(0, str(ROOT))
def _ensure_grch37_fasta() -> None:
    """Place the GRCh37 reference FASTA inside the installed tessera package.

    The FASTA is looked up next to ``tessera.ref_genomes``. If a file larger
    than 1 GB is already there, nothing is downloaded. Otherwise the gzipped
    assembly is fetched from NCBI and decompressed.

    The decompressed data is first written to a ``.partial`` sibling and only
    moved onto the final name once decompression has finished. That way an
    interrupted boot can never leave a truncated FASTA that would pass the
    size check on the next start. The downloaded archive and any partial
    output are removed whether or not the operation succeeds.

    Original author's note: the Space cannot ship the FASTA itself (~3 GB
    unpacked); pyfaidx + the SNV encoder need it for sequence context lookups.

    Raises:
        Whatever ``urllib``/``gzip``/file I/O raise on network or disk errors;
        nothing is swallowed.
    """
    import gzip
    import shutil
    import urllib.request

    import tessera.ref_genomes as _rg

    ref_dir = Path(_rg.__file__).parent
    fasta = ref_dir / "GCF_000001405.25_GRCh37.p13_genomic.fna"
    if fasta.exists() and fasta.stat().st_size > 1_000_000_000:
        print(f"[boot] reference FASTA already present ({fasta.stat().st_size / 1e9:.2f} GB)", flush=True)
        return

    url = ("https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/"
           "GCF_000001405.25_GRCh37.p13/"
           "GCF_000001405.25_GRCh37.p13_genomic.fna.gz")
    gz_path = fasta.with_suffix(".fna.gz")
    partial = fasta.with_suffix(".fna.partial")
    try:
        print("[boot] downloading GRCh37 FASTA from NCBI (~900 MB compressed)...", flush=True)
        t0 = time.time()
        urllib.request.urlretrieve(url, gz_path)
        print(f"[boot]   downloaded {gz_path.stat().st_size / 1e6:.0f} MB in {time.time()-t0:.0f}s", flush=True)

        print(f"[boot] decompressing -> {fasta}", flush=True)
        t0 = time.time()
        with gzip.open(gz_path, "rb") as fin, open(partial, "wb") as fout:
            shutil.copyfileobj(fin, fout, length=8 * 1024 * 1024)
        # Atomic on the same filesystem: the final name only ever refers to a
        # fully decompressed file.
        os.replace(partial, fasta)
    finally:
        # Runs on success and failure alike; after a successful replace the
        # partial file no longer exists, hence the existence check.
        for leftover in (gz_path, partial):
            if leftover.exists():
                leftover.unlink()
    print(f"[boot]   decompressed to {fasta.stat().st_size / 1e9:.2f} GB in {time.time()-t0:.0f}s", flush=True)


_ensure_grch37_fasta()
+from tessera.model import TESSERA
+import tessera.layers.pooling  # noqa: F401  ensure CreateMaskLayer is registered
+# ----------------------------------------------------------------------------
+# Configuration
+# ----------------------------------------------------------------------------
# Directory that contains this file; bundled data files are resolved against it.
HERE = Path(__file__).resolve().parent

# Local checkpoint directories, one per model variant (without / with LoH).
_LOCAL_MODELS = ROOT / "scripts" / "tcga_pancan_snv_cna" / "models"
MODEL_DIR_NOLOH = _LOCAL_MODELS / "TCGA_SNV_CNA_InfoNCE_per_sample_loss_noLOH"
MODEL_DIR_LOH = _LOCAL_MODELS / "TCGA_SNV_CNA_InfoNCE_per_sample_loss"

# Files expected next to this script.
TCGA_CNA_SORTED = HERE / "cna_sorted.npy"
LIFTOVER_CHAIN = HERE / "hg38ToHg19.over.chain.gz"

# Hugging Face Hub fallback. When a local model directory (above) does not
# exist - the case in any clean checkout, including Hugging Face Spaces
# containers - tessera.hub.download_pretrained pulls the corresponding
# variant from the repo named below. Set TESSERA_HUB_REPO to use another repo.
HUB_REPO_ID = os.environ.get("TESSERA_HUB_REPO", "JW-Sidhom-Lab/tessera-foundation")
HUB_VARIANT_NOLOH = "joint_snv_cna_noloh"
HUB_VARIANT_LOH = "joint_snv_cna"

# Passed to create_sample_dataset as context_len / batch_size.
CONTEXT_LEN = 25
BATCH_SIZE = 24

# Upper bound on distinct Tumor_Sample_Barcode values accepted per request.
MAX_SAMPLES_PER_REQUEST = 1000

# Heuristic: rough wall-clock per sample on Mac CPU (similar to Spaces free-tier CPU).
# Measured: 950 TCGA WES samples = 570s -> 0.6 s/sample; n=2000 MSK panel = 110s -> 0.05 s/sample.
SECS_PER_SAMPLE_PANEL = 0.05
SECS_PER_SAMPLE_WES = 0.6

import re

# Loose syntactic e-mail check: local part, "@", domain containing a dot.
EMAIL_RE = re.compile(r"^[\w\.\-+]+@[\w\.\-]+\.\w+$")
+# ----------------------------------------------------------------------------
+# Model + reference data loaded once at startup
+# ----------------------------------------------------------------------------
print("Loading TCGA CNA reference distribution...", flush=True)
# Reference values that quantile_normalize_to_tcga interpolates onto.
TCGA_SORTED = np.load(TCGA_CNA_SORTED)
_n_anchors = len(TCGA_SORTED)
print(f"  {_n_anchors:,} TCGA segment anchors", flush=True)

# Lazily filled cache of loaded models, keyed by the use_loh flag (see get_model).
_models: dict[bool, TESSERA] = {}
+def _resolve_model_dir(local_path, hub_variant: str) -> str:
+    """Prefer the local checkpoint if it's present (development); otherwise
+    pull the matching variant from the Hugging Face Hub via
+    tessera.hub.download_pretrained (cached under ~/.cache/huggingface/hub/
+    on subsequent calls)."""
+    if local_path.exists():
+        print(f"  resolved {hub_variant} -> local {local_path}", flush=True)
+        return str(local_path)
+    print(f"  resolved {hub_variant} -> pulling from {HUB_REPO_ID} on the Hub ...", flush=True)
+    from tessera.hub import download_pretrained
+    return download_pretrained(variant=hub_variant, repo_id=HUB_REPO_ID)
def get_model(use_loh: bool) -> TESSERA:
    """Return the TESSERA model for the requested variant, loading it once.

    ``use_loh`` selects between the with-LoH and noLoH checkpoints. The first
    call for a given flag resolves the model directory, constructs the model
    and stores it in the module-level ``_models`` cache; later calls return
    the cached instance.
    """
    if use_loh in _models:
        return _models[use_loh]

    if use_loh:
        local_path, hub_subfolder, label = MODEL_DIR_LOH, HUB_VARIANT_LOH, "with-LoH"
    else:
        local_path, hub_subfolder, label = MODEL_DIR_NOLOH, HUB_VARIANT_NOLOH, "noLoH"

    model_dir = _resolve_model_dir(local_path, hub_subfolder)
    print(f"Loading TESSERA ({label}) from {model_dir} ...", flush=True)
    loaded = TESSERA(
        model_dir=model_dir,
        use_distributed=False,
        jit_compile=False,
        mixed_precision=False,
    )
    _models[use_loh] = loaded
    return loaded
+# ----------------------------------------------------------------------------
+# Validation
+# ----------------------------------------------------------------------------
# Canonical column names every SNV upload must provide (matched case-insensitively).
SNV_REQUIRED = [
    "Tumor_Sample_Barcode",
    "Chromosome",
    "Start_Position",
    "Reference_Allele",
    "Tumor_Seq_Allele2",
]
# Canonical column names every CNA upload must provide.
CNA_REQUIRED = [
    "Tumor_Sample_Barcode",
    "Chromosome",
    "Start",
    "End",
    "Segment_Mean",
]
# Alleles accepted for single-base substitutions.
VALID_BASES = set("ACGT")
+def _resolve_columns(df: pd.DataFrame, required: list[str], optional: list[str] = ()) -> pd.DataFrame:
+    """Case-insensitive column matching, rename to canonical names."""
+    lower_to_orig = {c.lower(): c for c in df.columns}
+    out = df.copy()
+    rename = {}
+    missing = []
+    for col in required:
+        if col.lower() in lower_to_orig:
+            rename[lower_to_orig[col.lower()]] = col
+        else:
+            missing.append(col)
+    if missing:
+        raise ValueError(f"Missing required column(s): {missing}. Got columns: {list(df.columns)}")
+    for col in optional:
+        if col.lower() in lower_to_orig:
+            rename[lower_to_orig[col.lower()]] = col
+    return out.rename(columns=rename)
def validate_snv(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise and sanity-check an uploaded SNV table.

    Processing, in order:
      * column names are mapped to their canonical spelling;
      * sample barcodes and chromosomes are stringified and stripped, and a
        leading ``chr`` (any case) is removed from chromosomes;
      * ``Start_Position`` must parse as a whole number;
      * alleles are stripped and upper-cased;
      * ``vaf`` is taken from the ``vaf`` column, or computed as
        alt / (alt + ref) from ``t_alt_count`` / ``t_ref_count``; missing or
        infinite values become 0 and the result is clipped to [0, 1];
      * rows whose reference or alternate allele is not a single A/C/G/T are
        dropped (with a log line).

    Returns:
        The cleaned frame with a fresh 0..n-1 index.

    Raises:
        ValueError: empty input, missing columns, non-integer positions,
            no way to obtain VAF, or no substitution rows left.
    """
    if df is None or len(df) == 0:
        raise ValueError("SNV CSV is empty (no rows).")
    df = _resolve_columns(df, SNV_REQUIRED, optional=["vaf", "t_alt_count", "t_ref_count"])

    df["Tumor_Sample_Barcode"] = df["Tumor_Sample_Barcode"].astype(str).str.strip()
    df["Chromosome"] = (
        df["Chromosome"].astype(str).str.strip()
            .str.replace(r"^chr", "", regex=True, case=False)
    )

    positions = pd.to_numeric(df["Start_Position"], errors="coerce")
    # Unparseable values are NaN; fractional and infinite values have a
    # non-zero (or NaN) remainder. All of them would otherwise be silently
    # truncated or mangled by the int cast below.
    not_integral = positions.isna() | (positions % 1 != 0)
    n_bad = int(not_integral.sum())
    if n_bad:
        raise ValueError(f"SNV CSV has {n_bad} rows with non-integer Start_Position.")
    df["Start_Position"] = positions.astype(int)

    df["Reference_Allele"] = df["Reference_Allele"].astype(str).str.strip().str.upper()
    df["Tumor_Seq_Allele2"] = df["Tumor_Seq_Allele2"].astype(str).str.strip().str.upper()

    if "vaf" in df.columns:
        df["vaf"] = pd.to_numeric(df["vaf"], errors="coerce")
    elif {"t_alt_count", "t_ref_count"}.issubset(df.columns):
        alt = pd.to_numeric(df["t_alt_count"], errors="coerce")
        ref = pd.to_numeric(df["t_ref_count"], errors="coerce")
        df["vaf"] = alt / (alt + ref)  # 0/0 yields NaN, handled just below
    else:
        raise ValueError(
            "SNV CSV needs either a 'vaf' column or both 't_alt_count' and "
            "'t_ref_count' so VAF can be computed."
        )
    df["vaf"] = df["vaf"].fillna(0).replace([np.inf, -np.inf], 0).clip(0.0, 1.0)

    n_in = len(df)
    valid = (
        df["Reference_Allele"].isin(VALID_BASES)
        & df["Tumor_Seq_Allele2"].isin(VALID_BASES)
    )
    n_indels = int((~valid).sum())
    df = df.loc[valid].reset_index(drop=True)
    if df.empty:
        raise ValueError(
            f"All {n_in} SNV rows are non-substitutions (indels, multi-base, or "
            "non-A/C/G/T alleles). TESSERA only scores single-base substitutions. "
            "Filter your input first."
        )
    if n_indels:
        print(f"  validate_snv: dropped {n_indels:,} non-substitution rows "
              f"({n_indels/n_in*100:.1f}%); kept {len(df):,}", flush=True)
    return df
def validate_cna(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise and sanity-check an uploaded CNA segment table.

    Processing, in order:
      * column names are mapped to their canonical spelling;
      * sample barcodes and chromosomes are stringified and stripped, and a
        leading ``chr`` (any case) is removed from chromosomes;
      * ``Start`` and ``End`` must parse as whole numbers with Start <= End;
      * ``Segment_Mean`` must be numeric and present;
      * an optional ``LOH`` column is coerced to 0/1. Numeric values and the
        strings true/false/yes/no (any case, surrounding whitespace ignored)
        are accepted; missing entries become 0 and results are clipped to
        the range [0, 1].

    Returns:
        The cleaned frame.

    Raises:
        ValueError: empty input, missing columns, or any failed check above.
    """
    if df is None or len(df) == 0:
        raise ValueError("CNA CSV is empty (no rows).")
    df = _resolve_columns(df, CNA_REQUIRED, optional=["LOH"])

    df["Tumor_Sample_Barcode"] = df["Tumor_Sample_Barcode"].astype(str).str.strip()
    df["Chromosome"] = (
        df["Chromosome"].astype(str).str.strip()
            .str.replace(r"^chr", "", regex=True, case=False)
    )

    for col in ("Start", "End"):
        coords = pd.to_numeric(df[col], errors="coerce")
        # NaN (unparseable), fractional and infinite values all fail this test;
        # without it the int cast below would silently truncate them.
        not_integral = coords.isna() | (coords % 1 != 0)
        n_bad = int(not_integral.sum())
        if n_bad:
            raise ValueError(f"CNA CSV has {n_bad} rows with non-integer {col}.")
        df[col] = coords.astype(int)

    bad = df["Start"] > df["End"]
    if bad.any():
        raise ValueError(f"CNA CSV has {int(bad.sum())} rows where Start > End.")

    df["Segment_Mean"] = pd.to_numeric(df["Segment_Mean"], errors="coerce")
    n_nan = int(df["Segment_Mean"].isna().sum())
    if n_nan:
        raise ValueError(f"CNA CSV has {n_nan} rows with non-numeric or missing Segment_Mean.")

    if "LOH" in df.columns:
        loh_raw = df["LOH"]
        # Accept 0/1, True/False, "0"/"1", "True"/"False", "yes"/"no".
        as_text = loh_raw.astype(str).str.strip().str.lower()
        coerced = pd.to_numeric(
            as_text.replace({"true": "1", "false": "0", "yes": "1", "no": "0"}),
            errors="coerce",
        )
        # Originally-missing cells also coerce to NaN; they are not errors.
        n_bad = int(coerced.isna().sum() - loh_raw.isna().sum())
        if n_bad:
            raise ValueError(
                f"CNA LOH column has {n_bad} rows with values that aren't 0/1 "
                "(or True/False / yes/no)."
            )
        df["LOH"] = coerced.fillna(0).astype(int).clip(0, 1)
    return df
def quantile_normalize_to_tcga(vals: np.ndarray) -> np.ndarray:
    """Map ``vals`` onto the reference values in ``TCGA_SORTED`` by rank.

    Each input value gets the mid-rank quantile ``(rank - 0.5) / n`` (ties
    share their average rank). That quantile is then looked up by linear
    interpolation in ``TCGA_SORTED``, treated as evenly spaced over [0, 1].
    The result is returned as float32.

    NOTE(review): the name suggests ``TCGA_SORTED`` is in ascending order;
    that is not checked here.
    """
    n_values = len(vals)
    midrank_quantiles = (rankdata(vals, method="average") - 0.5) / n_values
    reference_grid = np.linspace(0.0, 1.0, len(TCGA_SORTED))
    mapped = np.interp(midrank_quantiles, reference_grid, TCGA_SORTED)
    return mapped.astype(np.float32)
+# ----------------------------------------------------------------------------
+# hg38 -> hg19 liftover (TESSERA was trained on TCGA in GRCh37/hg19, so any
+# input in another assembly must be lifted before inference). The actual
+# liftover is implemented in tessera.data.liftover; we only point it at the
+# bundled chain file so the Spaces runtime never has to hit UCSC.
+# ----------------------------------------------------------------------------
# Advertise the bundled chain file through TESSERA_LIFTOVER_CHAIN, but only
# when the file is really there and the variable has not been set already.
if LIFTOVER_CHAIN.exists() and "TESSERA_LIFTOVER_CHAIN" not in os.environ:
    os.environ["TESSERA_LIFTOVER_CHAIN"] = str(LIFTOVER_CHAIN)
+from tessera import lift_snv, lift_cna   # noqa: E402  (after env var is set)
+# ----------------------------------------------------------------------------
+# Inference
+# ----------------------------------------------------------------------------
def make_dummy_snv(sample_ids: list[str]) -> pd.DataFrame:
    """Build a placeholder SNV table with exactly one row per sample.

    Every row carries the same variant: chromosome 17, position 7577538,
    G>A, VAF 0.5. It is used when the SNV modality was not uploaded.
    """
    n_rows = len(sample_ids)
    placeholder = {
        "Chromosome": "17",
        "Start_Position": 7577538,
        "Reference_Allele": "G",
        "Tumor_Seq_Allele2": "A",
        "vaf": 0.5,
    }
    columns = {"Tumor_Sample_Barcode": sample_ids}
    for column, value in placeholder.items():
        columns[column] = [value] * n_rows
    return pd.DataFrame(columns)
def make_dummy_cna(sample_ids: list[str]) -> tuple[pd.DataFrame, np.ndarray]:
    """Build a placeholder CNA table with exactly one segment per sample.

    Every segment is chromosome 1, 1..1,000,000, with Segment_Mean 0.0.
    Returns the table together with a float32 array of zeros (one per
    sample) to be used as the segment means. It is used when the CNA
    modality was not uploaded.
    """
    n_rows = len(sample_ids)
    frame = pd.DataFrame({
        "Tumor_Sample_Barcode": sample_ids,
        "Chromosome": ["1"] * n_rows,
        "Start": [1] * n_rows,
        "End": [1_000_000] * n_rows,
        "Segment_Mean": [0.0] * n_rows,
    })
    segment_means = np.zeros(n_rows, dtype=np.float32)
    return frame, segment_means
def run_inference(snv_df: pd.DataFrame | None, cna_df: pd.DataFrame | None,
                    apply_qn: bool) -> dict:
    """Run TESSERA on validated SNV and/or CNA tables.

    A modality that is absent (``None`` or empty) is replaced by a one-row-
    per-sample placeholder so both model inputs are always populated. The
    with-LoH model is chosen when the CNA table has an ``LOH`` column with
    at least one non-missing value. CNA segment means are quantile-normalised
    against the TCGA reference when ``apply_qn`` is true.

    Returns:
        A dict with run metadata (``n_samples``, ``sample_ids``,
        ``model_variant``, ``snv_uploaded``, ``cna_uploaded``,
        ``qn_applied``) plus, for each uploaded modality, the model outputs
        (``variant_features`` / ``variant_probabilities`` and
        ``cna_features`` / ``cna_predictions``).

    Raises:
        ValueError: neither modality supplied, or more than
            ``MAX_SAMPLES_PER_REQUEST`` distinct samples.
    """
    have_snv = not (snv_df is None or snv_df.empty)
    have_cna = not (cna_df is None or cna_df.empty)
    if not have_snv and not have_cna:
        raise ValueError("Upload at least one of SNV or CNA.")

    # Union of barcodes over whichever tables were supplied.
    barcodes = set()
    for present, frame in ((have_snv, snv_df), (have_cna, cna_df)):
        if present:
            barcodes.update(frame["Tumor_Sample_Barcode"].unique())
    sample_ids = sorted(barcodes)
    n_samples = len(sample_ids)
    if n_samples > MAX_SAMPLES_PER_REQUEST:
        raise ValueError(f"Sample cap is {MAX_SAMPLES_PER_REQUEST} per request; "
                          f"got {n_samples}. Run locally for larger cohorts.")

    use_loh = have_cna and "LOH" in cna_df.columns and cna_df["LOH"].notna().any()

    # Pad missing modality (model graph requires both input branches)
    snv_df_full = snv_df if have_snv else make_dummy_snv(sample_ids)

    if have_cna:
        cna_df_full = cna_df
        raw_seg = cna_df_full["Segment_Mean"].astype(float).values
        if apply_qn and len(raw_seg) > 0:
            cna_seg_mean = quantile_normalize_to_tcga(raw_seg)
        else:
            cna_seg_mean = raw_seg.astype(np.float32)
        if use_loh:
            cna_lohs = cna_df_full["LOH"].fillna(0).astype(int).values
        else:
            cna_lohs = None
    else:
        cna_df_full, cna_seg_mean = make_dummy_cna(sample_ids)
        cna_lohs = None

    model = get_model(use_loh)
    # Millisecond timestamp serves as the dataset handle for this request.
    name = f"api_{int(time.time() * 1000)}"

    snv_inputs = dict(
        sample_ids=snv_df_full["Tumor_Sample_Barcode"].values,
        chromosomes=snv_df_full["Chromosome"].astype(str).values,
        positions=snv_df_full["Start_Position"].astype(int).values,
        refs=snv_df_full["Reference_Allele"].values,
        alts=snv_df_full["Tumor_Seq_Allele2"].values,
        vaf=snv_df_full["vaf"].values,
    )
    cna_inputs = dict(
        cna_sample_ids=cna_df_full["Tumor_Sample_Barcode"].values,
        cna_chromosomes=cna_df_full["Chromosome"].astype(str).values,
        cna_starts=cna_df_full["Start"].astype(int).values,
        cna_ends=cna_df_full["End"].astype(int).values,
        cna_segment_means=cna_seg_mean,
        cna_lohs=cna_lohs,
    )
    model.create_sample_dataset(
        **snv_inputs,
        context_len=CONTEXT_LEN,
        batch_size=BATCH_SIZE,
        name=name,
        is_training=False,
        fixed_bag_size=True,
        ref_len=1,
        alt_len=1,
        **cna_inputs,
        z_score_cna=False,
        z_score_clip=None,
    )

    variant_suffix = "" if use_loh else "_noLOH"
    out = {
        "n_samples": n_samples,
        "sample_ids": sample_ids,
        "model_variant": "InfoNCE_per_sample_loss" + variant_suffix,
        "snv_uploaded": have_snv,
        "cna_uploaded": have_cna,
        "qn_applied": (have_cna and apply_qn),
    }

    # Outputs are only collected for modalities the caller actually supplied.
    if have_snv:
        out["variant_features"] = model.get_variant_features(name, downcast=False)
        snv_probs, _ = model.get_variant_probabilities(
            name, return_logits=False, return_true_values=True,
            return_loss=False, non_zero_only=False, return_ref=False,
        )
        out["variant_probabilities"] = snv_probs
    if have_cna:
        out["cna_features"] = model.get_cna_features(name, downcast=False)
        cna_pred, _ = model.get_cna_predictions(
            name, return_true_values=True, return_loh=False,
        )
        out["cna_predictions"] = cna_pred
    return out
+# ----------------------------------------------------------------------------
+# Pack outputs into a ZIP for download
+# ----------------------------------------------------------------------------
def pack_outputs(result: dict) -> str:
    """Write an inference result to a ZIP file and return its path.

    The four array-valued entries (when present) are stored as ``<key>.npy``.
    Everything else goes into ``summary.json``, with values that JSON cannot
    encode converted via ``str``. The archive is created inside a fresh
    temporary directory that this function does not remove.
    """
    array_keys = ("variant_features", "variant_probabilities",
                  "cna_features", "cna_predictions")

    zip_path = Path(tempfile.mkdtemp(prefix="tessera_")) / "tessera_results.zip"

    summary = {}
    for key, value in result.items():
        if key not in array_keys:
            summary[key] = value

    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for key in array_keys:
            if key not in result:
                continue
            # Serialise through an in-memory buffer so nothing else hits disk.
            payload = io.BytesIO()
            np.save(payload, np.asarray(result[key]))
            archive.writestr(f"{key}.npy", payload.getvalue())
        archive.writestr("summary.json", json.dumps(summary, indent=2, default=str))
    return str(zip_path)
+# ----------------------------------------------------------------------------
+# Pretty HTML summary for the Gradio UI
+# ----------------------------------------------------------------------------
def render_summary_html(result: dict) -> str:
    """Render the run metadata of ``result`` as a small HTML block.

    Always lists sample count, model variant and which modalities were
    uploaded; the quantile-normalisation line appears only when CNA data was
    uploaded. Lines are separated by ``<br>``.
    """
    lines = []
    lines.append(f"<b>Samples:</b> {result['n_samples']}")
    lines.append(f"<b>Model:</b> {result['model_variant']}")
    lines.append(f"<b>SNV uploaded:</b> {result['snv_uploaded']}")
    lines.append(f"<b>CNA uploaded:</b> {result['cna_uploaded']}")
    if result["cna_uploaded"]:
        lines.append(f"<b>CNA quantile-normalized:</b> {result['qn_applied']}")
    body = "<br>".join(lines)
    return "<div style='font-family: sans-serif'>" + body + "</div>"
+# ----------------------------------------------------------------------------
+# Gradio entry point
+# ----------------------------------------------------------------------------
+def _read_csv_safe(path: str, label: str) -> pd.DataFrame:
+    try:
+        return pd.read_csv(path)
+    except pd.errors.EmptyDataError:
+        raise ValueError(f"{label} file is empty.")
+    except pd.errors.ParserError as e:
+        raise ValueError(
+            f"{label} file could not be parsed as CSV. Check that it's "
+            f"comma-separated (TSV / Excel files aren't supported). Pandas "
+            f"error: {e}"
+        )
+    except UnicodeDecodeError:
+        raise ValueError(
+            f"{label} file appears to be binary (e.g., an Excel .xlsx). "
+            f"Please save it as a CSV first."
+        )
+def _render_error_html(msg: str) -> str:
+    return (
+        "<div style='color:#7a0014; padding:14px; background:#fde7ea; "
+        "border:1px solid #f5c2c7; border-radius:8px; "
+        "font-family: sans-serif; line-height: 1.4;'>"
+        "<b style='color:#7a0014;'>Input error.</b><br>"
+        f"<span style='color:#7a0014;'>{msg}</span></div>"
+    )
+def _render_queued_html(job_id: str, n: int, email: str, est_min: int,
+                          liftover_note: str = "") -> str:
+    return (
+        "<div style='color:#0b3a66; padding:14px; background:#e8f4ff; "
+        "border:1px solid #b6d8ff; border-radius:8px; "
+        "font-family: sans-serif; line-height: 1.5;'>"
+        f"<b style='color:#0b3a66;'>&#10003; Job queued.</b> "
+        f"ID: <code style='color:#0b3a66;'>{job_id}</code><br>"
+        f"<b style='color:#0b3a66;'>{n}</b> sample(s); estimated wait "
+        f"<b style='color:#0b3a66;'>~{est_min} min</b>.<br>"
+        f"We'll email <code style='color:#0b3a66;'>{email}</code> with a "
+        f"download link when it's ready (link valid 24 hours)."
+        f"{liftover_note}"
+        "</div>"
+    )
def _estimate_minutes(snv_df, n_samples: int) -> int:
    """Estimate the wait, in whole minutes, for a job of ``n_samples``.

    The per-sample cost defaults to the panel rate. It switches to the WES
    rate when an SNV table is given and the median number of rows per
    sample (truncated to an integer) exceeds 50. The result is never less
    than 1.
    """
    per = SECS_PER_SAMPLE_PANEL
    if snv_df is not None and len(snv_df) > 0:
        rows_per_sample = snv_df.groupby("Tumor_Sample_Barcode").size()
        median_per_sample = int(rows_per_sample.median())
        if median_per_sample > 50:
            per = SECS_PER_SAMPLE_WES
    return max(1, round(n_samples * per / 60))
def submit_async(snv_file, cna_file, apply_qn: bool, email: str, assembly: str):
    """Validate an upload and enqueue it as an asynchronous job.

    Steps: check the e-mail address and the assembly, read and validate the
    uploaded CSVs, lift GRCh38 coordinates to GRCh37 when requested, enforce
    the sample cap, then hand the tables to ``jobs.submit_job``.

    Returns:
        ``(html, job_id)``. On success ``html`` is the "Job queued" panel.
        On any ``ValueError`` raised along the way, ``html`` is the error
        panel and ``job_id`` is the empty string. Other exception types
        propagate.
    """
    def _checked(frame, validator, label):
        """Run ``validator`` and prefix its ValueError message with ``label``."""
        try:
            return validator(frame)
        except ValueError as e:
            raise ValueError(f"{label}: {e}")

    try:
        address = email.strip() if email else ""
        if not address or not EMAIL_RE.match(address):
            raise ValueError("Please enter a valid email address.")
        if assembly not in ("GRCh37", "GRCh38"):
            raise ValueError(f"Unrecognised genome assembly {assembly!r}; pick GRCh37 or GRCh38.")

        snv_df = None if snv_file is None else _read_csv_safe(snv_file.name, "SNV CSV")
        cna_df = None if cna_file is None else _read_csv_safe(cna_file.name, "CNA CSV")
        if snv_df is None and cna_df is None:
            raise ValueError("Upload at least one of SNV or CNA CSV.")

        if snv_df is not None:
            snv_df = _checked(snv_df, validate_snv, "SNV CSV")
        if cna_df is not None:
            cna_df = _checked(cna_df, validate_cna, "CNA CSV")

        liftover_note = ""
        if assembly == "GRCh38":
            lifted = []
            if snv_df is not None:
                snv_df, snv_stats = lift_snv(snv_df, from_assembly="GRCh38")
                lifted.append(f"SNV {snv_stats['n_out']}/{snv_stats['n_in']}")
                if snv_df.empty:
                    raise ValueError("All SNV rows failed to lift from GRCh38 to GRCh37; check input.")
            if cna_df is not None:
                cna_df, cna_stats = lift_cna(cna_df, from_assembly="GRCh38")
                lifted.append(f"CNA {cna_stats['n_out']}/{cna_stats['n_in']}")
                if cna_df.empty:
                    raise ValueError("All CNA segments failed to lift from GRCh38 to GRCh37; check input.")
            liftover_note = "<br>Lifted GRCh38&rarr;GRCh37: " + ", ".join(lifted) + "."

        # Count distinct samples across whatever survived validation/liftover.
        barcodes = set()
        for frame in (snv_df, cna_df):
            if frame is not None:
                barcodes.update(frame["Tumor_Sample_Barcode"].tolist())
        n = len(barcodes)
        if n > MAX_SAMPLES_PER_REQUEST:
            raise ValueError(
                f"Sample cap is {MAX_SAMPLES_PER_REQUEST} per request; got {n}. "
                "Run inference locally for larger cohorts."
            )

        from jobs import submit_job
        est_min = _estimate_minutes(snv_df, n)
        job_id = submit_job(snv_df, cna_df, apply_qn, address, n)
        panel = _render_queued_html(job_id, n, address, est_min, liftover_note)
        return panel, job_id
    except ValueError as e:
        return _render_error_html(str(e)), ""
def get_status(job_id: str) -> dict:
    """Look up a job and return its state as a JSON-serialisable dict.

    A missing, empty or non-string ``job_id``, or one that ``jobs.get_job``
    does not know, yields ``{"status": "not_found"}``. Otherwise the dict
    carries ``status``, ``url`` (the job's ``result_url``), ``error``,
    ``n_samples``, ``created_at`` and ``finished_at`` copied from the job
    record. Per the original author's note, ``url`` is the same download
    link that is delivered by e-mail.
    """
    if not (job_id and isinstance(job_id, str)):
        return {"status": "not_found"}

    from jobs import get_job
    row = get_job(job_id.strip())
    if row is None:
        return {"status": "not_found"}

    # response key -> field name in the job record
    field_map = {
        "status": "status",
        "url": "result_url",
        "error": "error",
        "n_samples": "n_samples",
        "created_at": "created_at",
        "finished_at": "finished_at",
    }
    return {key: row[field] for key, field in field_map.items()}
def warmup() -> None:
    """Push one placeholder sample through the noLoH model at startup.

    Builds a single-sample dataset named "warmup" from the dummy SNV/CNA
    rows and requests both feature sets, discarding the results. Original
    author's note: this spares the first user request the 2-3 s
    graph-compilation cost.
    """
    print("Warming up the noLoH model...", flush=True)
    snv_df = make_dummy_snv(["WARMUP"])
    cna_df, cna_seg_mean = make_dummy_cna(["WARMUP"])
    model = get_model(use_loh=False)
    name = "warmup"

    snv_inputs = dict(
        sample_ids=snv_df["Tumor_Sample_Barcode"].values,
        chromosomes=snv_df["Chromosome"].astype(str).values,
        positions=snv_df["Start_Position"].astype(int).values,
        refs=snv_df["Reference_Allele"].values,
        alts=snv_df["Tumor_Seq_Allele2"].values,
        vaf=snv_df["vaf"].values,
    )
    cna_inputs = dict(
        cna_sample_ids=cna_df["Tumor_Sample_Barcode"].values,
        cna_chromosomes=cna_df["Chromosome"].astype(str).values,
        cna_starts=cna_df["Start"].astype(int).values,
        cna_ends=cna_df["End"].astype(int).values,
        cna_segment_means=cna_seg_mean,
        cna_lohs=None,
    )
    model.create_sample_dataset(
        **snv_inputs,
        context_len=CONTEXT_LEN,
        batch_size=BATCH_SIZE,
        name=name,
        is_training=False,
        fixed_bag_size=True,
        ref_len=1,
        alt_len=1,
        **cna_inputs,
        z_score_cna=False,
        z_score_clip=None,
    )
    # Results are intentionally thrown away; only the side effect matters.
    model.get_variant_features(name, downcast=False)
    model.get_cna_features(name, downcast=False)
    print("Warmup complete.", flush=True)
+import base64
# Logo shown in the page header, inlined as a base64 data URI so the page
# does not need a separate image request.
LOGO_PATH = ROOT / "logo.png"
with LOGO_PATH.open("rb") as _logo_fh:
    LOGO_DATA_URI = "data:image/png;base64," + base64.b64encode(_logo_fh.read()).decode("ascii")

# Gradio theme: Soft base with blue/orange hues and an Inter-first font stack.
THEME = gr.themes.Soft(primary_hue="blue", secondary_hue="orange",
                       font=("Inter", "system-ui", "sans-serif"))

# Page-level CSS: centred fixed-width container plus header/tagline styling.
CSS = """
.gradio-container {
    max-width: 1100px !important;
    margin-left: auto !important;
    margin-right: auto !important;
}
#tessera-header {text-align: center; padding: 24px 0 8px 0;}
#tessera-header img {max-height: 220px; width: auto; margin: 0 auto;}
#tessera-tagline {text-align: center; color: #888; font-style: italic;
                    margin: 4px 0 22px 0;}
"""
# Static page copy, kept out of the layout code below for readability.
_HEADER_HTML = (
    '<div id="tessera-header">'
    f'<img src="{LOGO_DATA_URI}" alt="TESSERA">'
    '</div>'
    '<p id="tessera-tagline">Tumour Embeddings via Self-Supervised Encoding '
    'and Reconstruction of Alterations</p>'
)
_INTRO_MD = (
    "Upload a **SNV** CSV, a **CNA** CSV, or both. We'll run inference and "
    "**email you a download link** when the results are ready (link valid "
    f"24 hours). **Cap: {MAX_SAMPLES_PER_REQUEST} samples per request.**"
)
_COLUMNS_MD = (
    "### Required CSV columns\n"
    "**SNV CSV**: `Tumor_Sample_Barcode`, `Chromosome` (string, no `chr` "
    "prefix), `Start_Position`, `Reference_Allele`, `Tumor_Seq_Allele2`, "
    "plus either `vaf` or both `t_alt_count` and `t_ref_count`. Only "
    "single-base substitutions are scored.<br>"
    "**CNA CSV**: `Tumor_Sample_Barcode`, `Chromosome`, `Start`, `End`, "
    "`Segment_Mean` (log2 ratio relative to copy-number 2). Optional "
    "`LOH` (0/1) triggers the with-LoH model variant."
)
_EXAMPLE_SNV = str(HERE / "example_snv.csv")
_EXAMPLE_CNA = str(HERE / "example_cna.csv")

with gr.Blocks(theme=THEME, title="TESSERA inference API", css=CSS) as demo:
    gr.HTML(_HEADER_HTML)
    gr.Markdown(_INTRO_MD)
    gr.Markdown(_COLUMNS_MD)

    # --- user inputs -------------------------------------------------------
    with gr.Row(equal_height=True):
        snv = gr.File(label="SNV CSV (optional)", file_types=[".csv"])
        cna = gr.File(label="CNA CSV (optional)", file_types=[".csv"])
    assembly = gr.Dropdown(
        label="Genome assembly of your input coordinates",
        choices=["GRCh37", "GRCh38"],
        value="GRCh37",
        info="GRCh37 (hg19) is the model's native assembly; GRCh38 (hg38) "
              "uploads are lifted to GRCh37 before inference.",
    )
    apply_qn = gr.Checkbox(
        label="Apply TCGA quantile normalization to CNA Segment_Mean",
        value=True,
        info="Maps your input distribution onto the TCGA training distribution. "
              "Recommended for cross-platform / out-of-distribution input.",
    )
    email_input = gr.Textbox(
        label="Email address",
        placeholder="you@example.com",
        info="We'll send your download link here when the job is ready.",
    )
    submit = gr.Button("Submit inference job", variant="primary", size="lg")
    status_html = gr.HTML()

    # Hidden API surface: the job_id is also returned as a plain string next
    # to the human-readable HTML panel, so Python clients can poll without
    # having to regex-extract the ID from the HTML.
    job_id_out = gr.Textbox(label="Job ID", visible=False)
    submit.click(
        submit_async,
        inputs=[snv, cna, apply_qn, email_input, assembly],
        outputs=[status_html, job_id_out],
        api_name="submit",
    )

    # Hidden status-polling endpoint exposed to the Gradio API only
    # (no visible UI). Clients call it via api_name="/status".
    _status_job_id = gr.Textbox(visible=False)
    _status_payload = gr.JSON(visible=False)
    _status_trigger = gr.Button(visible=False)
    _status_trigger.click(
        get_status,
        inputs=_status_job_id,
        outputs=_status_payload,
        api_name="status",
    )

    # --- bundled examples: both modalities, SNV only, CNA only --------------
    with gr.Accordion("Try a one-click example (5 TCGA validation samples)", open=False):
        gr.Examples(
            examples=[
                [_EXAMPLE_SNV, _EXAMPLE_CNA, True, "GRCh37"],
                [_EXAMPLE_SNV, None, True, "GRCh37"],
                [None, _EXAMPLE_CNA, True, "GRCh37"],
            ],
            inputs=[snv, cna, apply_qn, assembly],
            label=None,
        )

# Script entry point: warm the model up first, then start serving.
if __name__ == "__main__":
    warmup()
    demo.launch()