Skip to content

Commit 5da5e65

Browse files
tbitcs and oz-agent committed
feat: anchor quality audit + auto-fix, cleanup 605→285 anchors
Major data quality cleanup:
- Removed 215 phantom anchors (signs not in Holdat corpus)
- Removed 105 anchors with empty readings
- Downgraded 105 bulk-duplicate readings to CANDIDATE (48 signs sharing 'mīn', 20 sharing 'kur', etc.)
- Coverage recalculated: 96.6% → 91.9% (honest number)

New API endpoints:
- POST /api/v1/signs/audit — run quality audit, returns issues
- POST /api/v1/signs/audit/fix — auto-fix with backup

Scripts:
- _audit_anchors.py: 7-criterion quality check
- _fix_anchors.py: auto-cleanup with backup

UI: renamed 'Prune' to 'Delete' for consistency. Fixed optimistic override race condition in staging review.

Co-Authored-By: Oz <oz-agent@warp.dev>
1 parent 5b10253 commit 5da5e65

16 files changed

Lines changed: 8494 additions & 4074 deletions

backend/glossa_lab/api/signs.py

Lines changed: 128 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -317,3 +317,131 @@ async def get_sign(sign_id: str) -> dict[str, Any]:
317317
if entry is None:
318318
raise HTTPException(status_code=404, detail=f"Sign '{sign_id}' not found")
319319
return entry
320+
321+
322+
@router.post("/audit")
async def audit_anchors() -> dict[str, Any]:
    """Run a quality audit on all anchors and return issues.

    Checks performed by this endpoint:

    1. ``not_in_corpus`` (warn): anchor sign id has no row in the corpus CSV.
    2. ``empty_reading`` (high): anchor has a missing or blank ``reading``.
    3. ``duplicate_reading`` (high): one reading (case-insensitive) is
       assigned to more than five signs; reported once, against the first sign.
    4. ``high_low_freq`` (warn): anchor has confidence ``HIGH`` but fewer than
       five corpus rows.

    DEDR support and positional mismatches are NOT checked here.

    Checks 1 and 4 need corpus counts. When the corpus CSV is missing or
    yields no ``letters`` values, they are skipped and ``corpus_loaded`` is
    ``False``; otherwise every anchor would be misreported as a phantom.

    Returns:
        A summary dict with per-check counts, ``corpus_loaded``, and
        ``issues`` (at most 50 entries, high severity first). If the anchors
        JSON file is absent, returns ``{"error": ...}`` instead.
    """
    import csv as _csv  # noqa: PLC0415
    from collections import Counter as _Counter, defaultdict as _dd  # noqa: PLC0415
    from pathlib import Path as _P  # noqa: PLC0415

    repo = _P(__file__).resolve().parents[3]
    anchors_path = repo / "backend" / "reports" / "INDUS_FINAL_ANCHORS.json"
    try:
        from glossa_lab.config import get_project_config  # noqa: PLC0415

        # Coerce inside the try: a str result becomes a Path (so `.exists()`
        # works below) and a None result raises and takes the fallback.
        holdat_path = _P(get_project_config().corpus_csv_path())
    except Exception:  # noqa: BLE001
        holdat_path = (
            repo / "corpora" / "downloads" / "external_repos"
            / "holdatllc_indus" / "indus_corpus 2.csv"
        )

    if not anchors_path.exists():
        return {"error": "INDUS_FINAL_ANCHORS.json not found"}

    fa = json.loads(anchors_path.read_text(encoding="utf-8"))
    anch = fa.get("anchors", {})

    # Load corpus: count rows per value of the `letters` column.
    corpus_freq: _Counter[str] = _Counter()
    if holdat_path.exists():
        # newline="" is what the csv module asks for so that quoted fields
        # containing line breaks are parsed correctly.
        with open(holdat_path, encoding="utf-8", newline="") as f:
            for row in _csv.DictReader(f):
                # DictReader fills missing trailing fields with None, so
                # `row.get("letters", "")` can still hand back None.
                sign = (row.get("letters") or "").strip()
                if sign:
                    corpus_freq[sign] += 1
    corpus_loaded = bool(corpus_freq)

    issues: list[dict[str, str]] = []
    by_conf = _Counter(v.get("confidence", "?") for v in anch.values())

    # Check 1: not in corpus (meaningless without corpus counts).
    not_in_corpus: list[str] = []
    if corpus_loaded:
        not_in_corpus = [sid for sid in anch if sid not in corpus_freq]
    issues.extend(
        {"sign": sid, "issue": "not_in_corpus", "severity": "warn",
         "detail": f"{sid} not in Holdat corpus"}
        for sid in not_in_corpus
    )

    # Check 2: empty readings
    empty = [
        sid for sid, info in anch.items()
        if not (info.get("reading") or "").strip()
    ]
    issues.extend(
        {"sign": sid, "issue": "empty_reading", "severity": "high",
         "detail": f"{sid} has empty reading"}
        for sid in empty
    )

    # Check 3: duplicate readings (>5 signs)
    rtosigns: dict[str, list[str]] = _dd(list)
    for sid, info in anch.items():
        r = (info.get("reading") or "").strip().lower()
        if r:
            rtosigns[r].append(sid)
    dups = {r: s for r, s in rtosigns.items() if len(s) > 5}
    issues.extend(
        {"sign": signs[0], "issue": "duplicate_reading", "severity": "high",
         "detail": f"'{r}' assigned to {len(signs)} signs"}
        for r, signs in dups.items()
    )

    # Check 4: HIGH with <5 occurrences (meaningless without corpus counts).
    low_freq: list[tuple[str, int]] = []
    if corpus_loaded:
        low_freq = [
            (sid, corpus_freq.get(sid, 0))
            for sid, info in anch.items()
            if info.get("confidence") == "HIGH" and corpus_freq.get(sid, 0) < 5
        ]
    issues.extend(
        {"sign": sid, "issue": "high_low_freq", "severity": "warn",
         "detail": f"{sid} [HIGH] freq={freq}"}
        for sid, freq in low_freq
    )

    high_count = sum(1 for i in issues if i["severity"] == "high")
    warn_count = sum(1 for i in issues if i["severity"] == "warn")

    # Stable sort, high severity first, so the 50-entry cap below cannot
    # drop every high-severity issue behind a long run of warnings.
    issues.sort(key=lambda i: i["severity"] != "high")

    return {
        "total_anchors": len(anch),
        "by_confidence": dict(by_conf),
        "corpus_loaded": corpus_loaded,
        "total_issues": len(issues),
        "high_issues": high_count,
        "warn_issues": warn_count,
        "not_in_corpus": len(not_in_corpus),
        "empty_readings": len(empty),
        "duplicate_readings": len(dups),
        "low_freq_high": len(low_freq),
        "issues": issues[:50],  # cap response size
    }
406+
407+
408+
@router.post("/audit/fix")
async def fix_anchors() -> dict[str, Any]:
    """Auto-fix anchor quality issues by running ``scripts/_fix_anchors.py``.

    The script is run with the current interpreter and a 30-second timeout.
    Per the original endpoint description, it removes phantom anchors (not in
    corpus) and empty readings, downgrades bulk-duplicate readings to
    CANDIDATE, and creates a backup before modifying; that logic lives in the
    script, not in this function.

    On a zero exit code the signs index is invalidated and, best-effort, the
    foundation and dashboard caches are marked stale.

    Returns:
        ``{"ok": bool, "stdout": str, "stderr": str}`` with stdout trimmed to
        its last 1000 characters and stderr to its last 500. A timeout is
        reported as ``ok=False`` with an explanatory ``stderr``.

    Raises:
        HTTPException: 404 if the fix script does not exist.
    """
    import asyncio  # noqa: PLC0415
    import logging  # noqa: PLC0415
    import subprocess  # noqa: PLC0415, S404
    import sys  # noqa: PLC0415
    from pathlib import Path as _P  # noqa: PLC0415

    log = logging.getLogger(__name__)

    script = _P(__file__).resolve().parents[2] / "scripts" / "_fix_anchors.py"
    if not script.exists():
        raise HTTPException(404, "_fix_anchors.py not found")

    try:
        # subprocess.run blocks for up to the full timeout; run it in a worker
        # thread so this coroutine does not stall the event loop meanwhile.
        proc = await asyncio.to_thread(
            subprocess.run,  # noqa: S603
            [sys.executable, str(script)],
            capture_output=True, text=True, timeout=30,
        )
    except subprocess.TimeoutExpired:
        # Keep the normal response shape so clients see a failed run instead
        # of an unhandled server error.
        log.warning("_fix_anchors.py timed out after 30s")
        return {
            "ok": False,
            "stdout": "",
            "stderr": "_fix_anchors.py timed out after 30s",
        }

    ok = proc.returncode == 0
    # Invalidate signs index so next request picks up changes
    if ok:
        invalidate_signs_index()
        # The two refreshes below are best-effort: a failure is logged but
        # must not turn a successful fix into an error response.
        try:
            from glossa_lab.api.foundation import mark_dirty  # noqa: PLC0415

            mark_dirty()
        except Exception:  # noqa: BLE001
            log.warning("mark_dirty() failed after anchor fix", exc_info=True)
        try:
            from glossa_lab.api.dashboard import mark_insights_stale  # noqa: PLC0415

            mark_insights_stale()
        except Exception:  # noqa: BLE001
            log.warning(
                "mark_insights_stale() failed after anchor fix", exc_info=True
            )

    return {
        "ok": ok,
        "stdout": proc.stdout[-1000:] if proc.stdout else "",
        "stderr": proc.stderr[-500:] if proc.stderr else "",
    }

0 commit comments

Comments
 (0)