Grillcheese-AI
diff --git a/‎.gitignore‎
Lines changed: 5 additions & 0 deletions b/‎.gitignore‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎benchmarks/qc_data.py‎
Lines changed: 197 additions & 0 deletions b/‎benchmarks/qc_data.py‎
Lines changed: 197 additions & 0 deletions
diff --git a/‎benchmarks/qc_embedding_separation.py‎
Lines changed: 108 additions & 0 deletions b/‎benchmarks/qc_embedding_separation.py‎
Lines changed: 108 additions & 0 deletions
@@ -130,3 +130,8 @@ model/cubby/_probe_stage3_baseline.py
 /sandbox/regen
 subdomain_taxonomy_patterns.jsonl
 taxonomy_detector.jsonl
+_audit.ps1
+_check_live_brain.py
+CUBEMIND_REFACTOR_PLAN.md
+QC_APP_PLAN.md
+QC_PERCEPTION_PLAN.md
@@ -0,0 +1,197 @@
+"""MVTec AD data access + anomaly-detection metrics for the QC perception gate.
+
+Reusable across phases: the Phase 0 baseline harness and the Phase 1/2 patch
+method both load data and compute metrics through here. No asserted targets —
+this module only measures.
+
+Dataset: `TheoM55/mvtec_anomaly_detection` (ungated HF mirror of MVTec AD).
+Layout (from `metadata.csv`):
+    columns = path, label, split, object, defect, mask_path
+    path      = "train/<object>/good/NNN.png" | "test/<object>/<defect>/NNN.png"
+    label     = 0 (good) | 1..8 (defect-type id, per object)
+    split     = "train" | "test"
+    object    = one of the 15 MVTec categories
+    defect    = "good" | "<defect_name>"
+    mask_path = "" for good, else path to the ground-truth defect mask
+"""
+
+from __future__ import annotations
+
+import csv
+import os
+from dataclasses import dataclass, field
+
+import numpy as np
+
+# Hugging Face dataset repo id passed to hf_hub_download() in
+# resolve_dataset_root().
+MVTEC_REPO = "TheoM55/mvtec_anomaly_detection"
+
+# The 15 MVTec AD category names, i.e. the values the `object` column of
+# metadata.csv is documented to take (see the module docstring).
+MVTEC_CATEGORIES = [
+    "bottle", "cable", "capsule", "carpet", "grid", "hazelnut", "leather",
+    "metal_nut", "pill", "screw", "tile", "toothbrush", "transistor", "wood",
+    "zipper",
+]
+
+
+@dataclass
+class Sample:
+    """A single MVTec image record.
+
+    Only paths and labels are stored here; pixel data is read separately
+    (see ``load_image`` / ``load_mask``).
+    """
+
+    # Path to the image file on disk.
+    image_path: str
+    # Binary anomaly flag: 0 = good, 1 = anomalous.
+    label: int
+    # "good", or the name of the defect type.
+    defect: str
+    # Path to the ground-truth defect mask; None for good samples.
+    mask_path: str | None
+
+
+@dataclass
+class CategoryData:
+    """Sample lists for one MVTec category.
+
+    ``train`` is expected to hold good-only samples and ``test`` a mix of good
+    and defective ones; the class itself does not enforce this.
+    """
+
+    category: str
+    # Training samples (all good).
+    train: list[Sample] = field(default_factory=list)
+    # Test samples (good + defects).
+    test: list[Sample] = field(default_factory=list)
+
+    @property
+    def n_test_good(self) -> int:
+        """Number of test samples whose binary label is 0."""
+        return len([s for s in self.test if s.label == 0])
+
+    @property
+    def n_test_defect(self) -> int:
+        """Number of test samples whose binary label is 1."""
+        return len([s for s in self.test if s.label == 1])
+
+
+def resolve_dataset_root() -> str:
+    """Return the local directory that holds the mirror's ``metadata.csv``.
+
+    ``metadata.csv`` is obtained through ``hf_hub_download`` and the directory
+    containing the returned file is used as the dataset root. Image and mask
+    files are not fetched here; they are assumed to have been downloaded
+    already (e.g. via ``snapshot_download``).
+    """
+    from huggingface_hub import hf_hub_download
+
+    metadata_file = hf_hub_download(
+        MVTEC_REPO, "metadata.csv", repo_type="dataset"
+    )
+    snapshot_dir, _ = os.path.split(metadata_file)
+    return snapshot_dir
+
+
+def load_category(category: str, root: str | None = None) -> CategoryData:
+    """Load train/test sample lists for one MVTec category from metadata.csv.
+
+    Args:
+        category: Value matched against the ``object`` column; rows belonging
+            to other objects are skipped.
+        root: Directory containing ``metadata.csv``. When falsy it is resolved
+            with ``resolve_dataset_root()``.
+
+    Returns:
+        A ``CategoryData`` whose ``train`` list holds the rows with
+        ``split == "train"`` and whose ``test`` list holds every other row.
+        Both lists are empty if no row matches ``category``.
+    """
+    root = root or resolve_dataset_root()
+    meta_csv = os.path.join(root, "metadata.csv")
+    data = CategoryData(category=category)
+    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
+    # locale encoding (e.g. cp1252 on Windows).
+    with open(meta_csv, newline="", encoding="utf-8") as f:
+        for row in csv.DictReader(f):
+            if row["object"] != category:
+                continue
+            # Normalise Windows-style separators before joining onto root.
+            rel = row["path"].replace("\\", "/")
+            img_path = os.path.join(root, "images", rel)
+            # Collapse the per-object defect-type id into a binary flag.
+            label = int(row["label"])
+            binary = 0 if label == 0 else 1
+            # An empty/missing mask_path means "no mask" (good samples).
+            mask_rel = (row.get("mask_path") or "").replace("\\", "/")
+            mask_path = os.path.join(root, "masks", mask_rel) if mask_rel else None
+            sample = Sample(
+                image_path=img_path,
+                label=binary,
+                defect=row.get("defect", "good"),
+                mask_path=mask_path,
+            )
+            if row["split"] == "train":
+                data.train.append(sample)
+            else:
+                data.test.append(sample)
+    return data
+
+
+def load_image(path: str) -> np.ndarray:
+    """Read the image at ``path`` and return it as an (H, W, 3) uint8 array
+    with channels in BGR order (cv2-style, brain-native).
+    """
+    from PIL import Image
+
+    # PIL decodes to RGB; shape is (H, W, 3).
+    rgb = np.asarray(Image.open(path).convert("RGB"), dtype=np.uint8)
+    # Reverse the channel axis (RGB -> BGR) and materialise the view.
+    bgr = rgb[..., ::-1]
+    return bgr.copy()
+
+
+def load_mask(path: str | None, shape: tuple[int, int]) -> np.ndarray:
+    """Return a binary defect mask as a uint8 array with values in {0, 1}.
+
+    If ``path`` is empty/None or does not exist on disk, an all-zero mask of
+    the requested ``shape`` is returned. Otherwise the file is read as
+    greyscale and every non-zero pixel becomes 1; ``shape`` is not applied to
+    a mask loaded from disk.
+    """
+    mask_available = bool(path) and os.path.exists(path)
+    if not mask_available:
+        return np.zeros(shape, dtype=np.uint8)
+
+    from PIL import Image
+
+    gray = np.asarray(Image.open(path).convert("L"), dtype=np.uint8)
+    return (gray != 0).astype(np.uint8)
+
+
+# ── Metrics (numpy-only, no sklearn dependency) ──────────────────────────────
+
+
+def auroc(scores: np.ndarray, labels: np.ndarray) -> float:
+    """Area under ROC via the Mann-Whitney U statistic.
+
+    Args:
+        scores: 1-D anomaly scores; higher = more anomalous.
+        labels: 1-D labels aligned with ``scores``; 1 = anomalous, 0 = normal.
+
+    Returns:
+        The AUROC as a float. Tied scores receive the average rank of their
+        tie block. Returns 0.5 if either class is absent (including empty
+        input).
+    """
+    scores = np.asarray(scores, dtype=np.float64)
+    labels = np.asarray(labels, dtype=np.int64)
+    n_pos = int((labels == 1).sum())
+    n_neg = int((labels == 0).sum())
+    if n_pos == 0 or n_neg == 0:
+        return 0.5
+
+    order = np.argsort(scores, kind="mergesort")
+    sorted_scores = scores[order]
+    n = len(sorted_scores)
+
+    # Mark the first element of every run of equal scores. Comparing with `!=`
+    # (rather than differencing) keeps +/-inf ties in one block.
+    block_start = np.empty(n, dtype=bool)
+    block_start[0] = True
+    block_start[1:] = sorted_scores[1:] != sorted_scores[:-1]
+
+    first = np.flatnonzero(block_start)       # 0-based start of each tie block
+    last = np.append(first[1:], n) - 1        # 0-based end of each tie block
+    block_rank = (first + last) / 2.0 + 1.0   # 1-based average rank per block
+
+    # Scatter each block's average rank back to the original positions.
+    ranks = np.empty(n, dtype=np.float64)
+    ranks[order] = block_rank[np.cumsum(block_start) - 1]
+
+    sum_ranks_pos = ranks[labels == 1].sum()
+    u_pos = sum_ranks_pos - n_pos * (n_pos + 1) / 2.0
+    return float(u_pos / (n_pos * n_neg))
+
+
+def best_f1(scores: np.ndarray, labels: np.ndarray) -> dict:
+    """Sweep every distinct score as a threshold and report the best F1.
+
+    The threshold is chosen on the very set it is evaluated on (an ORACLE
+    threshold), so the result is optimistic. Phase 3 replaces this with a
+    proper validation-split calibration.
+
+    Returns:
+        A dict with keys ``threshold``, ``precision``, ``recall`` and ``f1``
+        for the first threshold (in ascending order) that attains the maximum
+        F1. All four values are 0.0 for empty input.
+    """
+    scores = np.asarray(scores, dtype=np.float64)
+    labels = np.asarray(labels, dtype=np.int64)
+    if len(scores) == 0:
+        return {"threshold": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}
+
+    thresholds = np.unique(scores)
+    is_pos = labels == 1
+    is_neg = labels == 0
+
+    # Sentinel F1 of -1 guarantees the first candidate replaces it.
+    winner = {
+        "threshold": float(thresholds[0]),
+        "precision": 0.0,
+        "recall": 0.0,
+        "f1": -1.0,
+    }
+    for cut in thresholds:
+        flagged = scores >= cut
+        tp = int(np.count_nonzero(flagged & is_pos))
+        fp = int(np.count_nonzero(flagged & is_neg))
+        fn = int(np.count_nonzero(~flagged & is_pos))
+
+        n_flagged = tp + fp
+        n_actual = tp + fn
+        precision = tp / n_flagged if n_flagged else 0.0
+        recall = tp / n_actual if n_actual else 0.0
+        denom = precision + recall
+        score_f1 = 2 * precision * recall / denom if denom else 0.0
+
+        # Strict '>' keeps the lowest threshold among ties.
+        if score_f1 > winner["f1"]:
+            winner = {
+                "threshold": float(cut),
+                "precision": precision,
+                "recall": recall,
+                "f1": score_f1,
+            }
+    return winner
+
+
+def pixel_auroc(score_maps: list[np.ndarray], masks: list[np.ndarray],
+                max_pixels: int = 2_000_000) -> float:
+    """Pixel-level AUROC over the supplied (score map, mask) pairs.
+
+    Args:
+        score_maps: Per-image anomaly score maps (higher = more anomalous).
+        masks: Ground-truth masks paired positionally with ``score_maps``;
+            any non-zero pixel counts as anomalous.
+        max_pixels: If more pixels than this are collected, a fixed-seed
+            random subsample of this size is scored instead (for
+            tractability).
+
+    Returns:
+        The AUROC from ``auroc``, or NaN when no usable pair is given or when
+        the collected pixels are all normal or all anomalous.
+
+    Raises:
+        ValueError: If a score map and its mask hold a different number of
+            pixels (the labels would otherwise be silently misaligned).
+    """
+    flat_scores: list[np.ndarray] = []
+    flat_labels: list[np.ndarray] = []
+    for sm, mk in zip(score_maps, masks):
+        # Pairs with a missing score map or mask are skipped.
+        if sm is None or mk is None:
+            continue
+        sm_flat = np.asarray(sm).ravel().astype(np.float64)
+        # Binarise so 0/255-style masks behave like {0,1} masks.
+        mk_flat = (np.asarray(mk).ravel() != 0).astype(np.int64)
+        if sm_flat.size != mk_flat.size:
+            raise ValueError(
+                f"score map has {sm_flat.size} pixels but mask has "
+                f"{mk_flat.size}"
+            )
+        flat_scores.append(sm_flat)
+        flat_labels.append(mk_flat)
+    if not flat_scores:
+        return float("nan")
+    s = np.concatenate(flat_scores)
+    y = np.concatenate(flat_labels)
+    # AUROC is undefined when only one class is present.
+    n_anomalous = int(y.sum())
+    if n_anomalous == 0 or n_anomalous == len(y):
+        return float("nan")
+    if len(s) > max_pixels:
+        # Fixed seed keeps the subsample (and thus the metric) reproducible.
+        rng = np.random.default_rng(0)
+        idx = rng.choice(len(s), size=max_pixels, replace=False)
+        s, y = s[idx], y[idx]
+    return auroc(s, y)
@@ -0,0 +1,108 @@
+"""P0-3: prove the global-embedding bottleneck directly.
+
+For one category, embed known-good parts into a bank, then plot the distribution
+of nearest-neighbour distances (1 - max cosine to the good bank) for held-out
+GOOD images vs DEFECT images. If the two distributions overlap heavily, a single
+global embedding cannot separate good from subtly-defective — which is the whole
+justification for the patch-based rework.
+
+    python -m benchmarks.qc_embedding_separation --category carpet
+"""
+
+from __future__ import annotations
+
+import argparse
+
+import numpy as np
+
+from benchmarks.qc_data import auroc, load_category, load_image, resolve_dataset_root
+from benchmarks.qc_perception_eval import GlobalEmbeddingScorer
+
+
+def overlap_coefficient(a: np.ndarray, b: np.ndarray, bins: int = 40) -> float:
+    """Histogram overlap coefficient in [0,1]; 1 = identical, 0 = disjoint.
+
+    Both samples are binned on a shared grid of ``bins`` equal-width bins
+    spanning their joint range; the result is the integral of the bin-wise
+    minimum of the two density histograms.
+
+    Args:
+        a: First sample of values (array-like).
+        b: Second sample of values (array-like).
+        bins: Number of equal-width bins on the shared grid.
+
+    Returns:
+        The overlap coefficient. When every value in both samples is the same
+        number the two distributions coincide and 1.0 is returned.
+
+    Raises:
+        ValueError: If either sample is empty.
+    """
+    a = np.asarray(a)
+    b = np.asarray(b)
+    if a.size == 0 or b.size == 0:
+        raise ValueError("overlap_coefficient requires two non-empty samples")
+    lo = float(min(a.min(), b.min()))
+    hi = float(max(a.max(), b.max()))
+    if hi == lo:
+        # Zero-width range: density histograms would divide by zero. Both
+        # samples are the same point mass, i.e. they overlap completely.
+        return 1.0
+    edges = np.linspace(lo, hi, bins + 1)
+    ha, _ = np.histogram(a, bins=edges, density=True)
+    hb, _ = np.histogram(b, bins=edges, density=True)
+    width = edges[1] - edges[0]
+    return float(np.minimum(ha, hb).sum() * width)
+
+
+def run(category: str, n_train: int, device: str, out_png: str,
+        results: str) -> None:
+    """Measure and plot good-vs-defect score separation for one category.
+
+    Fits a ``GlobalEmbeddingScorer`` on (a seeded random subset of) the
+    category's training images, scores every test image, then reports the
+    image-level AUROC and the histogram overlap of the two score
+    distributions. A histogram figure is saved and a markdown summary is
+    appended to the results file.
+
+    Args:
+        category: MVTec category name passed to ``load_category``.
+        n_train: Maximum number of training images to fit on; 0 (falsy) uses
+            all of them.
+        device: Device string forwarded to ``GlobalEmbeddingScorer``.
+        out_png: Path the figure is written to.
+        results: Path of the markdown file the summary is appended to.
+    """
+    import os
+
+    root = resolve_dataset_root()
+    cat = load_category(category, root)
+    rng = np.random.default_rng(0)   # fixed seed -> reproducible subset
+    train = cat.train
+    if n_train and len(train) > n_train:
+        idx = rng.choice(len(train), size=n_train, replace=False)
+        train = [train[i] for i in idx]
+
+    scorer = GlobalEmbeddingScorer(device=device)
+    scorer.fit([load_image(s.image_path) for s in train])
+
+    good_d, defect_d = [], []
+    for s in cat.test:
+        d = scorer.score(load_image(s.image_path))   # 1 - max cosine to good bank
+        (good_d if s.label == 0 else defect_d).append(d)
+    good_d = np.asarray(good_d)
+    defect_d = np.asarray(defect_d)
+
+    ov = overlap_coefficient(good_d, defect_d)
+    scores = np.concatenate([good_d, defect_d])
+    labels = np.concatenate([np.zeros(len(good_d)), np.ones(len(defect_d))])
+    au = auroc(scores, labels)
+
+    # Make sure the output locations exist before writing to them, so a
+    # missing results directory does not discard the work done above.
+    for target in (out_png, results):
+        parent = os.path.dirname(target)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+
+    import matplotlib
+    matplotlib.use("Agg")   # headless backend; no display required
+    import matplotlib.pyplot as plt
+
+    fig, ax = plt.subplots(figsize=(7, 4.2))
+    bins = np.linspace(scores.min(), scores.max(), 40)
+    ax.hist(good_d, bins=bins, alpha=0.6, label=f"good (n={len(good_d)})",
+            color="#2a9d8f", density=True)
+    ax.hist(defect_d, bins=bins, alpha=0.6, label=f"defect (n={len(defect_d)})",
+            color="#e76f51", density=True)
+    ax.axvline(good_d.mean(), color="#2a9d8f", ls="--", lw=1)
+    ax.axvline(defect_d.mean(), color="#e76f51", ls="--", lw=1)
+    ax.set_xlabel("NN distance to good bank  (1 - max cosine, SigLIP global embedding)")
+    ax.set_ylabel("density")
+    ax.set_title(f"MVTec '{category}': global-embedding separation\n"
+                 f"AUROC={au:.3f}   histogram overlap={ov:.2f}")
+    ax.legend()
+    fig.tight_layout()
+    fig.savefig(out_png, dpi=130)
+    # Release the figure so repeated calls do not accumulate open figures.
+    plt.close(fig)
+    print(f"saved {out_png}")
+    print(f"category={category} AUROC={au:.3f} overlap={ov:.2f} "
+          f"good_mean={good_d.mean():.4f} defect_mean={defect_d.mean():.4f}")
+
+    # The summary contains non-ASCII characters, so write it as UTF-8
+    # explicitly instead of relying on the platform's locale encoding.
+    with open(results, "a", encoding="utf-8") as f:
+        f.write(
+            f"\n### P0-3 global-embedding separation — `{category}`\n"
+            f"- image-level AUROC (NN-distance) = **{au:.3f}**, "
+            f"good/defect histogram overlap = **{ov:.2f}** "
+            f"(1.0 = indistinguishable).\n"
+            f"- good mean dist = {good_d.mean():.4f}, "
+            f"defect mean dist = {defect_d.mean():.4f}.\n"
+            f"- Figure: `{out_png}`. Heavy overlap ⇒ a single global vector "
+            f"cannot separate good from subtly-defective parts; localization is "
+            f"lost. This is the bottleneck the patch path removes.\n"
+        )
+
+
+def main(argv=None) -> None:
+    """Parse command-line options and hand them to ``run``.
+
+    ``argv`` is forwarded to ``ArgumentParser.parse_args``; None means the
+    process arguments are used. If ``--out`` is empty, the figure path is
+    derived from the category name.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--category", default="carpet")
+    parser.add_argument("--n-train", type=int, default=80)
+    parser.add_argument("--device", default="cpu")
+    parser.add_argument("--out", default="")
+    parser.add_argument("--results", default="benchmarks/results/qc_perception.md")
+    opts = parser.parse_args(argv)
+
+    figure_path = opts.out
+    if not figure_path:
+        figure_path = f"benchmarks/results/p0_3_{opts.category}_global_separation.png"
+    run(opts.category, opts.n_train, opts.device, figure_path, opts.results)
+
+
+# Script entry point (python -m benchmarks.qc_embedding_separation).
+if __name__ == "__main__":
+    main()