@@ -317,3 +317,131 @@ async def get_sign(sign_id: str) -> dict[str, Any]:
317317 if entry is None :
318318 raise HTTPException (status_code = 404 , detail = f"Sign '{ sign_id } ' not found" )
319319 return entry
320+
321+
@router.post("/audit")
async def audit_anchors() -> dict[str, Any]:
    """Run a quality audit on all anchors and return a summary of issues.

    Checks performed:

    * ``not_in_corpus`` (warn): the anchor's sign id never occurs in the
      ``letters`` column of the corpus CSV.
    * ``empty_reading`` (high): the anchor has no reading, or only whitespace.
    * ``duplicate_reading`` (high): one reading (compared case-insensitively)
      is assigned to more than five signs; reported once, against the first
      sign that carries it.
    * ``high_low_freq`` (warn): an anchor with confidence ``HIGH`` occurs
      fewer than five times in the corpus.

    The two corpus-based checks are skipped when no corpus data could be
    loaded (file missing or no usable rows); otherwise every anchor would be
    reported as absent from the corpus. ``corpus_available`` in the response
    tells the caller whether those checks actually ran.

    Returns:
        ``{"error": ...}`` when the anchors file does not exist; otherwise a
        dict with the anchor total, a per-confidence breakdown, per-check
        counts, and the first 50 issues.
    """
    import csv as _csv  # noqa: PLC0415
    from collections import Counter as _Counter, defaultdict as _dd  # noqa: PLC0415
    from pathlib import Path as _P  # noqa: PLC0415

    repo = _P(__file__).resolve().parents[3]
    anchors_path = repo / "backend" / "reports" / "INDUS_FINAL_ANCHORS.json"
    try:
        from glossa_lab.config import get_project_config  # noqa: PLC0415
        # Coerce to Path so `.exists()` works even if the config hands back a
        # plain string; an unusable value (e.g. None) falls through to the
        # hard-coded default below.
        holdat_path = _P(get_project_config().corpus_csv_path())
    except Exception:  # noqa: BLE001
        holdat_path = (
            repo / "corpora" / "downloads" / "external_repos"
            / "holdatllc_indus" / "indus_corpus 2.csv"
        )

    # EAFP: read directly instead of exists()-then-read, which is race-prone.
    try:
        fa = json.loads(anchors_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return {"error": "INDUS_FINAL_ANCHORS.json not found"}
    # `or {}` also covers an explicit `"anchors": null` in the file.
    anch = fa.get("anchors") or {}

    def _reading(info: dict[str, Any]) -> str:
        """Return the anchor's reading as a stripped string ('' if absent)."""
        return str(info.get("reading") or "").strip()

    # Load corpus: count how often each sign appears in the "letters" column.
    corpus_freq: _Counter[str] = _Counter()
    if holdat_path.exists():
        # newline="" is what the csv module requires for correct handling of
        # quoted fields that contain line breaks.
        with open(holdat_path, encoding="utf-8", newline="") as f:
            for row in _csv.DictReader(f):
                # DictReader fills missing trailing fields with None, so the
                # `.get(key, "")` default alone does not protect `.strip()`.
                sign = (row.get("letters") or "").strip()
                if sign:
                    corpus_freq[sign] += 1
    corpus_available = bool(corpus_freq)

    issues: list[dict[str, str]] = []
    by_conf = _Counter(v.get("confidence", "?") for v in anch.values())

    # Check 1: not in corpus (only meaningful when corpus data was loaded)
    not_in_corpus = (
        [sid for sid in anch if sid not in corpus_freq] if corpus_available else []
    )
    for sid in not_in_corpus:
        issues.append({"sign": sid, "issue": "not_in_corpus", "severity": "warn",
                       "detail": f"{sid} not in Holdat corpus"})

    # Check 2: empty readings
    empty = [sid for sid, info in anch.items() if not _reading(info)]
    for sid in empty:
        issues.append({"sign": sid, "issue": "empty_reading", "severity": "high",
                       "detail": f"{sid} has empty reading"})

    # Check 3: duplicate readings (>5 signs)
    rtosigns: dict[str, list[str]] = _dd(list)
    for sid, info in anch.items():
        r = _reading(info).lower()
        if r:
            rtosigns[r].append(sid)
    dups = {r: s for r, s in rtosigns.items() if len(s) > 5}
    for r, signs in dups.items():
        issues.append({"sign": signs[0], "issue": "duplicate_reading",
                       "severity": "high",
                       "detail": f"'{r}' assigned to {len(signs)} signs"})

    # Check 4: HIGH with <5 occurrences (only meaningful with corpus data)
    low_freq = (
        [(sid, corpus_freq[sid])
         for sid, info in anch.items()
         if info.get("confidence") == "HIGH" and corpus_freq[sid] < 5]
        if corpus_available else []
    )
    for sid, freq in low_freq:
        issues.append({"sign": sid, "issue": "high_low_freq", "severity": "warn",
                       "detail": f"{sid} [HIGH] freq={freq}"})

    high_issues = [i for i in issues if i["severity"] == "high"]
    warn_issues = [i for i in issues if i["severity"] == "warn"]

    return {
        "total_anchors": len(anch),
        "by_confidence": dict(by_conf),
        "total_issues": len(issues),
        "high_issues": len(high_issues),
        "warn_issues": len(warn_issues),
        "not_in_corpus": len(not_in_corpus),
        "empty_readings": len(empty),
        "duplicate_readings": len(dups),
        "low_freq_high": len(low_freq),
        "corpus_available": corpus_available,
        "issues": issues[:50],  # cap response size
    }
406+
407+
@router.post("/audit/fix")
async def fix_anchors() -> dict[str, Any]:
    """Run the ``_fix_anchors.py`` maintenance script and report its outcome.

    The script is executed with the current interpreter in a worker thread,
    so the event loop stays responsive while it runs (up to 30 seconds).
    NOTE(review): the script itself is not visible from this module; it was
    previously described as removing phantom anchors (not in corpus) and empty
    readings, downgrading bulk-duplicate readings to CANDIDATE, and creating
    a backup before modifying. Confirm against the script.

    On success the signs index is invalidated and, best-effort, the
    foundation and dashboard caches are marked stale so the next request
    picks up the changes.

    Returns:
        ``{"ok": bool, "stdout": str, "stderr": str}`` with the output tails
        (last 1000 / 500 characters). A timeout is reported as ``ok=False``
        with an explanatory ``stderr`` rather than as a server error.

    Raises:
        HTTPException: 404 when the script file does not exist.
    """
    import asyncio  # noqa: PLC0415
    import logging  # noqa: PLC0415
    import subprocess  # noqa: PLC0415, S404
    import sys  # noqa: PLC0415
    from pathlib import Path as _P  # noqa: PLC0415

    log = logging.getLogger(__name__)
    timeout_s = 30

    def _tail(output: Any, limit: int) -> str:
        """Return the last *limit* characters of *output* ('' if empty)."""
        if not output:
            return ""
        # TimeoutExpired may carry raw bytes even when text=True was requested.
        if isinstance(output, bytes):
            output = output.decode("utf-8", errors="replace")
        return output[-limit:]

    script = _P(__file__).resolve().parents[2] / "scripts" / "_fix_anchors.py"
    if not script.exists():
        raise HTTPException(404, "_fix_anchors.py not found")

    try:
        # subprocess.run blocks; off-load it so other requests keep flowing.
        proc = await asyncio.to_thread(
            subprocess.run,  # noqa: S603
            [sys.executable, str(script)],
            capture_output=True, text=True, timeout=timeout_s,
        )
    except subprocess.TimeoutExpired as exc:
        log.warning("_fix_anchors.py timed out after %ss", timeout_s)
        return {
            "ok": False,
            "stdout": _tail(exc.stdout, 1000),
            "stderr": _tail(exc.stderr, 500)
            or f"_fix_anchors.py timed out after {timeout_s}s",
        }

    ok = proc.returncode == 0
    # Invalidate signs index so next request picks up changes
    if ok:
        invalidate_signs_index()
        # The two notifications below are best-effort: a missing module or a
        # failing hook must not turn a successful fix into an error, but the
        # failure is logged instead of being silently discarded.
        try:
            from glossa_lab.api.foundation import mark_dirty  # noqa: PLC0415
            mark_dirty()
        except Exception:  # noqa: BLE001
            log.warning("Could not mark foundation data dirty", exc_info=True)
        try:
            from glossa_lab.api.dashboard import mark_insights_stale  # noqa: PLC0415
            mark_insights_stale()
        except Exception:  # noqa: BLE001
            log.warning("Could not mark dashboard insights stale", exc_info=True)

    return {
        "ok": ok,
        "stdout": _tail(proc.stdout, 1000),
        "stderr": _tail(proc.stderr, 500),
    }
0 commit comments