Grillcheese-AI
diff --git a/‎model/cubby/attention.py‎
Lines changed: 126 additions & 0 deletions b/‎model/cubby/attention.py‎
Lines changed: 126 additions & 0 deletions
diff --git a/‎model/cubby/blocks.py‎
Lines changed: 218 additions & 0 deletions b/‎model/cubby/blocks.py‎
Lines changed: 218 additions & 0 deletions
@@ -0,0 +1,126 @@
+"""Local sliding-window causal self-attention."""
+from __future__ import annotations
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class LocalCausalAttention(nn.Module):
+    """Sliding-window causal self-attention.
+
+    Every position ``p`` attends to keys ``[p - W + 1 .. p]`` where ``W``
+    is ``window``. The layer is meant to supplement MinGRU's recurrence
+    with precise local token-to-token lookups: the recurrence handles
+    global context decay; attention handles "who said what" precision.
+
+    When ``S <= window`` a single plain-causal SDPA call is used. When
+    ``S > window`` the queries are processed in chunks of ``W`` tokens;
+    each chunk attends to its own keys plus the ``W - 1`` keys before it,
+    i.e. at most ``2W - 1`` keys, so every position sees its full window
+    regardless of where it lands inside its chunk. The per-chunk score
+    block is therefore at most ``W x (2W - 1)`` instead of ``L x L`` —
+    roughly ``L² / (2W²)`` smaller (about 2048x at L=32768, W=512) — and
+    no ``L x L`` mask is ever built. Total work over all ``ceil(L / W)``
+    chunks is O(L·W).
+
+    Optional gradient checkpointing (``grad_checkpoint=True``) re-runs
+    the attention forward during backprop instead of keeping its
+    activations; it is only applied while the module is in training mode.
+
+    Each chunk goes through ``F.scaled_dot_product_attention``; backend
+    selection is left to PyTorch.
+    NOTE(review): chunks after the first pass an explicit ``attn_mask``;
+    confirm which fused SDPA backend your PyTorch build picks for masked
+    calls before relying on a specific kernel.
+
+    Args:
+        d_model: Model width; must be divisible by ``n_heads``.
+        n_heads: Number of attention heads.
+        window: Number of positions (including the current one) each
+            query may attend to; must be >= 1.
+        grad_checkpoint: Enable activation checkpointing in training.
+
+    Raises:
+        ValueError: If ``n_heads`` does not evenly divide ``d_model`` or
+            ``window`` is smaller than 1.
+    """
+
+    def __init__(self, d_model: int, n_heads: int = 4, window: int = 128,
+                 grad_checkpoint: bool = False):
+        super().__init__()
+        # Explicit raises instead of ``assert`` so the checks survive -O.
+        if n_heads < 1 or d_model % n_heads != 0:
+            raise ValueError(
+                f"d_model ({d_model}) must be divisible by a positive "
+                f"n_heads ({n_heads})")
+        # window <= 0 would make the chunk loop either fail inside range()
+        # or run zero times and return an uninitialised output buffer.
+        if window < 1:
+            raise ValueError(f"window must be >= 1, got {window}")
+        self.n_heads = n_heads
+        self.d_head = d_model // n_heads
+        self.window = window
+        self.grad_checkpoint = bool(grad_checkpoint)
+        self.qkv = nn.Linear(d_model, 3 * d_model, bias=False)
+        self.out_proj = nn.Linear(d_model, d_model, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply windowed attention to ``x`` of shape ``(B, S, D)``."""
+        if self.training and self.grad_checkpoint:
+            # use_reentrant=False is the modern path; matches the
+            # forward graph + handles non-tensor outputs gracefully.
+            from torch.utils.checkpoint import checkpoint
+            return checkpoint(self._forward_impl, x, use_reentrant=False)
+        return self._forward_impl(x)
+
+    def _forward_impl(self, x: torch.Tensor) -> torch.Tensor:
+        """Project to Q/K/V, run (chunked) attention, project back."""
+        B, S, D = x.shape
+        qkv = self.qkv(x).reshape(B, S, 3, self.n_heads, self.d_head)
+        q, k, v = qkv.unbind(dim=2)                        # each (B, S, H, Dh)
+        q = q.transpose(1, 2).contiguous()                 # (B, H, S, Dh)
+        k = k.transpose(1, 2).contiguous()
+        v = v.transpose(1, 2).contiguous()
+
+        if S <= self.window:
+            # Full causal SDPA — window covers everything; no mask build.
+            out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
+        else:
+            out = self._chunked_sliding_window(q, k, v)
+
+        out = out.transpose(1, 2).contiguous().reshape(B, S, D)
+        return self.out_proj(out)
+
+    @staticmethod
+    def _band_mask(n_rows: int, window: int,
+                   device: torch.device) -> torch.Tensor:
+        """Boolean ``(n_rows, n_rows + window - 1)`` mask, True = attend.
+
+        For a chunk whose key slice starts ``window - 1`` positions before
+        its first query, chunk-local row ``i`` may see key columns
+        ``i .. i + window - 1`` (its own position is the last of those).
+        """
+        row = torch.arange(n_rows, device=device).unsqueeze(1)
+        col = torch.arange(n_rows + window - 1, device=device).unsqueeze(0)
+        rel = col - row
+        return (rel >= 0) & (rel < window)
+
+    def _chunked_sliding_window(
+        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+    ) -> torch.Tensor:
+        """O(L·W) sliding-window attention via Q chunking.
+
+        Process Q in chunks of ``W`` positions. For each chunk starting
+        at ``c_start``, gather keys/values from
+        ``[max(0, c_start - W + 1) : c_end]`` — the union of every chunk
+        position's window (each position ``p`` attends to
+        ``[p - W + 1 .. p]``). Each per-chunk attention is then a small
+        ``c_len x (<= 2W - 1)`` SDPA call.
+
+        The first chunk needs only plain causal masking because the
+        window's lower bound is never below position 0 there. Every later
+        chunk starts at a multiple of ``W``, so its key slice always
+        begins exactly ``W - 1`` positions before ``c_start``; row ``i``
+        may then attend to columns ``i .. i + W - 1``. That mask is the
+        same for every full chunk, so it is built once and reused; only a
+        shorter trailing chunk needs its own (smaller) mask.
+
+        Args:
+            q, k, v: Tensors of shape ``(B, H, S, Dh)``.
+
+        Returns:
+            Attention output with the same shape as ``q``.
+        """
+        W = self.window
+        S = q.shape[2]
+        out = torch.empty_like(q)
+        full_mask = None  # lazily-built mask shared by all full chunks
+
+        for c_start in range(0, S, W):
+            c_end = min(c_start + W, S)
+            if c_start == 0:
+                # First chunk — windowed causal collapses to plain causal
+                # because col >= i - W + 1 always holds when i < W.
+                out[:, :, :c_end] = F.scaled_dot_product_attention(
+                    q[:, :, :c_end], k[:, :, :c_end], v[:, :, :c_end],
+                    is_causal=True)
+                continue
+
+            c_len = c_end - c_start
+            k_start = c_start - W + 1      # > 0 because c_start >= W here
+            if c_len == W:
+                if full_mask is None:
+                    full_mask = self._band_mask(W, W, q.device)
+                mask = full_mask
+            else:
+                # Trailing partial chunk: fewer rows and columns.
+                mask = self._band_mask(c_len, W, q.device)
+
+            # Every mask row contains at least its own position, so no
+            # query row is fully masked.
+            out[:, :, c_start:c_end] = F.scaled_dot_product_attention(
+                q[:, :, c_start:c_end],
+                k[:, :, k_start:c_end],
+                v[:, :, k_start:c_end],
+                attn_mask=mask)
+        return out
+
+
@@ -0,0 +1,218 @@
+"""Cubby blocks: HybridBlock (MoE + attention + memory + GLU)
+and MinGRUBlock (pure MinGRU + GLU)."""
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from nn_primitives import RMSNorm
+from layers import MinGRULayer, GLUChannelMix
+from moe import MoEMinGRULayer
+from attention import LocalCausalAttention
+from episodic_memory import EpisodicMemory
+
+if TYPE_CHECKING:
+    from train_torch import TrainConfig
+
+class HybridBlock(nn.Module):
+    """MinGRU (or MoE-MinGRU) + optional local attention + optional
+    hippocampal memory injection + GLU.
+
+    Architecture per block::
+
+        x = x + Mixer(RMSNorm(x))         # MinGRU or MoE-MinGRU
+        x = x + Attn(RMSNorm(x))          # local attention (if enabled)
+        x = x + MemGate(hippo.read(x))    # memory injection (if enabled)
+        x = x + GLU(RMSNorm(x))           # channel mixing
+
+    When ``cfg.enable_residual_scale`` is set, each residual branch is
+    multiplied by its own learned scalar (initialised to 1.0) before
+    being added.
+
+    Memory injection: the sequence-mean hidden state queries the episodic
+    memory; the retrieved vector is projected through ``mem_gate`` (a
+    zero-initialised linear layer, so it starts as a no-op) and broadcast
+    to all positions. Per the original author's note this does not affect
+    gradients on the memory itself — only the gate is trainable.
+    NOTE(review): the query is built from batch element 0 only
+    (``x.mean(dim=1)[0]``) and the result is broadcast across the whole
+    batch — confirm this is intended for batch sizes > 1.
+
+    Args:
+        cfg: Training configuration; required fields are read directly,
+            optional ones through ``getattr`` with defaults.
+        layer_idx: Index of this block, used to decide whether attention
+            and memory are active (``layer_idx % *_every_n == 0``).
+        memory: Optional shared episodic memory object.
+
+    Raises:
+        ValueError: If ``cfg.grouping_strategy`` or
+            ``cfg.group_routing_variant`` holds an unknown value while
+            group routing is enabled.
+    """
+
+    def __init__(self, cfg: "TrainConfig", layer_idx: int,
+                 memory: "EpisodicMemory | None" = None):
+        super().__init__()
+        d = cfg.d_model
+        self.layer_idx = layer_idx
+
+        # Sequence mixer
+        self.rms_mix = RMSNorm(d)
+        if cfg.enable_moe:
+            self.mix = MoEMinGRULayer(
+                d, cfg.moe_n_experts, cfg.moe_top_k,
+                enable_hypergrad=cfg.enable_hypergrad,
+                hypergrad_scale_init=cfg.hypergrad_scale_init,
+                gate_noise_std=getattr(cfg, "moe_gate_noise_std", 0.0),
+                gate_init_std=getattr(cfg, "moe_gate_init_std", 0.0),
+                decay_bias_stagger=getattr(cfg, "moe_decay_bias_stagger", False),
+                decay_bias_lo=getattr(cfg, "moe_decay_bias_lo", -1.0),
+                decay_bias_hi=getattr(cfg, "moe_decay_bias_hi", 2.0),
+                enable_hebbian_growth=getattr(cfg, "enable_hebbian_growth", False),
+                hebbian_n_components=getattr(cfg, "hebbian_n_components", 8),
+                hebbian_max_components=getattr(cfg, "hebbian_max_components", 64),
+                hebbian_grow_threshold=getattr(cfg, "hebbian_grow_threshold", 0.35),
+                hebbian_lr=getattr(cfg, "hebbian_lr", 5e-4),
+                hebbian_lateral_beta=getattr(cfg, "hebbian_lateral_beta", 0.05),
+                hebbian_grow_cooldown=getattr(cfg, "hebbian_grow_cooldown", 100),
+                hebbian_ema=getattr(cfg, "hebbian_ema", 0.01),
+                max_experts=getattr(cfg, "moe_max_experts", 32),
+                n_shared=getattr(cfg, "moe_shared_experts", 0),
+            )
+        else:
+            self.mix = MinGRULayer(
+                d,
+                enable_hypergrad=cfg.enable_hypergrad,
+                hypergrad_scale_init=cfg.hypergrad_scale_init,
+            )
+
+        # v4 Group-Routing: wrap the per-token mixer with a GroupedMoEBlock.
+        # Tokens are pooled into G < S groups, the inner MoE runs per-group
+        # (the inner MoE doesn't know "S" became "G" — it just sees a shorter
+        # sequence dim), then outputs are scattered back to per-token. Trade-off:
+        # syntactic coherence + cheaper routing for loss of per-token routing
+        # freedom. Opt-in via cfg.enable_group_routing; default off so existing
+        # configs are byte-identical until explicitly switched on.
+        # Note: the flag is silently ignored when cfg.enable_moe is False.
+        if getattr(cfg, "enable_group_routing", False) and cfg.enable_moe:
+            from group_routing import (
+                FixedSizeGrouping, LearnedGrouping, HebbianGrouping,
+                SupervisedSVCGrouping, GroupedMoEBlock, GroupedMoEBlockBias,
+            )
+            strategy = getattr(cfg, "grouping_strategy", "fixed_size")
+            if strategy == "fixed_size":
+                grouping_fn = FixedSizeGrouping(
+                    group_size=getattr(cfg, "group_size", 4))
+            elif strategy == "learned":
+                grouping_fn = LearnedGrouping(
+                    d_model=d,
+                    n_groups=getattr(cfg, "n_groups", 16),
+                    temperature=getattr(cfg, "group_temperature", 1.0))
+            elif strategy == "hebbian":
+                # Caller threads the basis per forward via grouping_fn.set_basis(W).
+                grouping_fn = HebbianGrouping(
+                    sig_dim=getattr(cfg, "hebbian_n_components", 8))
+            elif strategy == "svc":
+                grouping_fn = SupervisedSVCGrouping(
+                    fallback_group_size=getattr(cfg, "group_size", 4))
+            else:
+                raise ValueError(
+                    f"unknown cfg.grouping_strategy={strategy!r}; expected one of "
+                    f"'fixed_size', 'learned', 'hebbian', 'svc'")
+            # Dispatch on the scatter-back variant. "bias" adds a per-token
+            # D→D linear projection to the per-group MoE output so each
+            # token in a group gets per-position variation in the residual
+            # stream — fixes the autoregressive echo-collapse observed
+            # with pure replication. "replicated" is legacy/ablation only.
+            variant = getattr(cfg, "group_routing_variant", "bias")
+            if variant == "bias":
+                BlockCls = GroupedMoEBlockBias
+            elif variant == "replicated":
+                BlockCls = GroupedMoEBlock
+            else:
+                raise ValueError(
+                    f"unknown cfg.group_routing_variant={variant!r}; expected "
+                    f"'bias' (recommended) or 'replicated' (legacy)")
+            self.mix = BlockCls(
+                d_model=d, inner_moe=self.mix, grouping_fn=grouping_fn)
+            self.uses_group_routing = True
+            self.group_routing_variant = variant
+        else:
+            self.uses_group_routing = False
+            self.group_routing_variant = None
+
+        # Local attention (on selected layers).
+        # Always register placeholders (nn.Identity) so the module's
+        # _modules dict has a stable shape across all block instances.
+        # Dynamo guards on attribute location — mixing __dict__-stored
+        # `None` and _modules-stored Module triggers per-layer recompiles
+        # and blows the cache_size_limit.
+        self.has_attn = bool(cfg.enable_attention
+                             and layer_idx % cfg.attn_every_n == 0)
+        if self.has_attn:
+            self.rms_attn = RMSNorm(d)
+            self.attn = LocalCausalAttention(
+                d, cfg.attn_n_heads, cfg.attn_window,
+                grad_checkpoint=getattr(cfg, "attn_grad_checkpoint", False),
+            )
+        else:
+            self.rms_attn = nn.Identity()
+            self.attn = nn.Identity()
+
+        # Hippocampal memory injection (on selected layers). Same dict-
+        # location stability concern as attention above.
+        self.has_memory = bool(memory is not None and cfg.enable_memory
+                               and layer_idx % cfg.mem_every_n == 0)
+        if self.has_memory:
+            self.memory = memory
+            self.mem_gate = nn.Linear(d, d, bias=False)
+            nn.init.zeros_(self.mem_gate.weight)  # start as no-op
+        else:
+            self.memory = None  # external state, not a Module — kept in __dict__
+            self.mem_gate = nn.Identity()
+
+        # FFN
+        self.rms_ffn = RMSNorm(d)
+        self.ffn = GLUChannelMix(d, cfg.d_ffn)
+
+        # Learned residual scaling (ZAYA1-8B / OpenMythos 2026). One scalar
+        # per residual addition, initialised to 1.0 — at step 0 the block
+        # is byte-identical to the un-scaled baseline. Per-stream gates
+        # let the optimiser dampen norm growth through depth (matters most
+        # at L >= 18) without touching layer weights. Total overhead per
+        # HybridBlock: 4 params, ~0 FLOPs.
+        # NOTE(review): alpha_attn / alpha_mem are created even on layers
+        # where attention / memory are inactive, so they may never receive
+        # gradients there — check against any unused-parameter detection
+        # in the training setup.
+        self.enable_residual_scale = bool(
+            getattr(cfg, "enable_residual_scale", False))
+        if self.enable_residual_scale:
+            self.alpha_mix = nn.Parameter(torch.ones(1))
+            self.alpha_attn = nn.Parameter(torch.ones(1))
+            self.alpha_mem = nn.Parameter(torch.ones(1))
+            self.alpha_ffn = nn.Parameter(torch.ones(1))
+
+    def _add_residual(self, x: torch.Tensor, branch: torch.Tensor,
+                      alpha_name: str) -> torch.Tensor:
+        """Return ``x + branch``, scaling ``branch`` by the named alpha
+        parameter first when residual scaling is enabled."""
+        if self.enable_residual_scale:
+            # The alpha_* parameters only exist when scaling is enabled.
+            branch = getattr(self, alpha_name) * branch
+        return x + branch
+
+    def _memory_injection(self, x: torch.Tensor) -> torch.Tensor:
+        """Read episodic memory and return a ``(1, 1, d)`` injection."""
+        x_mean = x.mean(dim=1)
+        # Only batch element 0 is used as the query (see class docstring).
+        retrieved = self.memory.read(x_mean[0])
+        mem_inject = self.mem_gate(retrieved)
+        # Two leading singleton dims so the vector broadcasts over batch
+        # and sequence when added to x.
+        return mem_inject.unsqueeze(0).unsqueeze(0)
+
+    def forward(self, x: torch.Tensor, surprise_gain: float = 0.0) -> torch.Tensor:
+        """Run mixer → (attention) → (memory) → FFN residual branches.
+
+        Args:
+            x: Input hidden states; assumed ``(batch, seq, d_model)``.
+            surprise_gain: Forwarded unchanged to the sequence mixer.
+
+        Returns:
+            Hidden states with the same shape as ``x``.
+        """
+        x = self._add_residual(
+            x, self.mix(self.rms_mix(x), surprise_gain=surprise_gain),
+            "alpha_mix")
+        if self.has_attn:
+            x = self._add_residual(
+                x, self.attn(self.rms_attn(x)), "alpha_attn")
+        # Skip the read entirely while the memory is still empty.
+        if self.has_memory and self.memory is not None and self.memory.size > 0:
+            x = self._add_residual(x, self._memory_injection(x), "alpha_mem")
+        x = self._add_residual(x, self.ffn(self.rms_ffn(x)), "alpha_ffn")
+        return x
+
+
+class MinGRUBlock(nn.Module):
+    """Plain MinGRU residual block (no MoE, no attention).
+
+    Two pre-norm residual branches are applied in order: the MinGRU
+    sequence mixer, then the GLU channel mix.
+    """
+
+    def __init__(self, cfg: TrainConfig):
+        super().__init__()
+        width = cfg.d_model
+        # Registration order (norms first, then mixer, then FFN) is kept
+        # so parameter / state-dict ordering stays the same.
+        self.rms_mix = RMSNorm(width)
+        self.rms_ffn = RMSNorm(width)
+        self.mix = MinGRULayer(
+            width,
+            enable_hypergrad=cfg.enable_hypergrad,
+            hypergrad_scale_init=cfg.hypergrad_scale_init,
+        )
+        self.ffn = GLUChannelMix(width, cfg.d_ffn)
+
+    def forward(self, x: torch.Tensor, surprise_gain: float = 0.0) -> torch.Tensor:
+        """Apply the mixer and FFN residual branches to ``x``.
+
+        ``surprise_gain`` is forwarded unchanged to the MinGRU mixer.
+        """
+        mixed = self.mix(self.rms_mix(x), surprise_gain=surprise_gain)
+        after_mix = x + mixed
+        channel = self.ffn(self.rms_ffn(after_mix))
+        return after_mix + channel
+
+