Source code for deepSTRF.training.fitter

"""``deepSTRF.training.Fitter`` — a thin, opt-in PyTorch training loop.

See ``docs/_source/md/fitter.md`` for the full design contract. This module
implements the canonical 3-line training step from
``metrics_paradigm.md`` §7, plus early stopping, checkpoint selection, and
cross-batch metric accumulation. The class is intentionally short — when
something doesn't fit (multi-GPU, mixed-precision, curricula, ...) the
recommended path is to write the loop, not to extend the Fitter.
"""

from __future__ import annotations

from pathlib import Path
from typing import Any, Callable, Dict, List, Mapping, Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim.optimizer import Optimizer
from torch.utils.data import DataLoader

from deepSTRF.metrics import corrcoef, mse_loss, normalized_corrcoef


# -----------------------------------------------------------------------------
# Helpers
# -----------------------------------------------------------------------------


def _default_val_metrics() -> Dict[str, Callable]:
    """Canonical val-metric pair: raw and noise-corrected Pearson correlation."""
    return {
        "cc": lambda pred, responses: corrcoef(pred, responses, reduction="none"),
        "cc_norm": lambda pred, responses: normalized_corrcoef(
            pred, responses, method="schoppe", reduction="none"
        ),
    }


def _to_scalar(x: Any) -> float:
    """Reduce a per-neuron tensor to a scalar via ``nanmean``; pass scalars through."""
    if isinstance(x, torch.Tensor):
        if x.numel() == 0:
            return float("nan")
        if x.numel() == 1:
            return float(x.detach().item())
        return float(torch.nanmean(x.detach()).item())
    return float(x)


def _format_epoch(epoch_dict: Mapping[str, Any]) -> None:
    """Default ``log_fn``: ``epoch | k=v | k=v | ...`` to stdout."""
    parts = []
    for k, v in epoch_dict.items():
        if k == "epoch":
            parts.append(f"epoch {int(v):4d}")
        else:
            parts.append(f"{k}={_to_scalar(v):.4f}")
    print(" | ".join(parts))


def _pad_and_cat(tensors: List[torch.Tensor]) -> torch.Tensor:
    """Right-pad each tensor along its last two axes (R, T) with NaN, cat along dim 0.

    All tensors must share dims (B_i, N, R_i, T_i) with the same ``N``. ``B_i``
    may vary (e.g. last partial batch); ``R_i`` and ``T_i`` may vary per batch
    and are padded to the global max with NaN. The NaN pads are dropped by
    every ``deepSTRF.metrics`` function via its NaN-derived mask
    (``metrics_paradigm.md`` §4).
    """
    if not tensors:
        raise ValueError("cannot concatenate an empty list of tensors")
    max_R = max(t.shape[2] for t in tensors)
    max_T = max(t.shape[3] for t in tensors)
    padded = []
    for t in tensors:
        pad_R = max_R - t.shape[2]
        pad_T = max_T - t.shape[3]
        if pad_R or pad_T:
            # F.pad takes (left_T, right_T, left_R, right_R) — back-to-front.
            t = F.pad(t, (0, pad_T, 0, pad_R), value=float("nan"))
        padded.append(t)
    return torch.cat(padded, dim=0)


# -----------------------------------------------------------------------------
# Fitter
# -----------------------------------------------------------------------------



[docs]
class Fitter:
    """Opt-in training loop for a deepSTRF :class:`NeuralModel`.

    See ``docs/_source/md/fitter.md`` for the full design.

    Parameters
    ----------
    model
        Any ``nn.Module`` whose ``forward`` emits ``(B, N, 1, T)`` predictions.
        Stateful models may implement ``model.detach()`` (no-op by default on
        ``deepSTRF.models.NeuralModel``); the Fitter calls it after every step.
    train_loader, val_loader
        ``DataLoader`` instances built with ``deepSTRF.utils.data.neural_collate``.
        ``val_loader`` with ``batch_size=1`` is the simplest case but any
        batch size works thanks to NaN-pad-and-cat (§6).
    loss_fn
        Callable ``(pred, responses) -> Tensor``. Default ``mse_loss``. The
        deepSTRF losses auto-collapse ``responses`` to PSTH internally
        (``metrics_paradigm.md`` §2), so no caller-side ``nanmean`` is needed.
    val_metrics
        Mapping ``name -> callable(pred, responses) -> per-neuron Tensor``.
        Default: the canonical ``{'cc', 'cc_norm'}`` pair. Stored under
        ``f'val_{name}'`` in the epoch dict.
    optimizer
        Any ``torch.optim.Optimizer``. Default: ``AdamW(model.parameters(),
        lr=1e-3, weight_decay=1e-4)``.
    device
        Where to place the model and per-batch tensors.
    max_epochs
        Hard cap on training epochs.
    patience
        Early-stop patience: number of epochs without improvement on
        ``monitor`` before the loop terminates.
    min_delta
        Minimum change in ``monitor`` that counts as an improvement (and so
        resets the patience counter / saves a new best checkpoint). Default
        ``0.0`` (any strict improvement counts). Set a small positive value
        (e.g. ``1e-5``) when the monitored quantity micro-fluctuates on a
        plateau — otherwise sub-noise wiggles keep resetting patience and the
        loop never early-stops, forcing reliance on ``max_epochs``. With
        ``min_delta > 0`` patience can be the sole stopping criterion and
        ``max_epochs`` set effectively unbounded.

        Note: leaving ``min_delta = 0`` (the default) is often *better* for final
        accuracy — validation noise is unbiased, so a new best-on-val is a
        genuinely better point worth keeping (and the ckpt captures it). Pair
        ``min_delta = 0`` with ``reduce_lr_on_plateau`` for long, high-quality fits.
    reduce_lr_on_plateau
        If ``True``, attach a :class:`~torch.optim.lr_scheduler.ReduceLROnPlateau`
        on ``monitor`` that drops the learning rate by ``lr_factor`` after
        ``lr_patience`` epochs without improvement. ``min_lr`` is pinned to
        ``base_lr * lr_factor`` so exactly **one** reduction can occur. When the
        LR drops, the early-stop patience counter is reset so the model gets a
        fresh window at the lower LR before stopping. Default ``False``.
    lr_factor, lr_patience, lr_threshold
        Multiplicative LR-drop factor (default ``0.2``), plateau patience
        (default ``30``), and the relative improvement threshold below which an
        epoch counts as "no improvement" for the scheduler (default ``1e-4``,
        PyTorch's default). Used only when ``reduce_lr_on_plateau=True``. With
        ``min_delta=0`` (keep every best-on-val), a too-small ``lr_threshold``
        lets sub-noise val_loss creep masquerade as progress so the LR never
        drops — raise it (e.g. ``1e-3``) so the drop fires on a genuine plateau.
        Re-applied on resume, so it can be changed when continuing a run.
    patience_after_lr_drop
        If set, the early-stop patience switches to this (typically smaller)
        value once the LR has dropped — the model is annealed and near-converged
        then, so it needn't wait as long. Default ``None`` (use ``patience``
        throughout). Tracked across resume via the saved state.
    ema_decay
        If set (e.g. ``0.999``), maintain an exponential moving average of the
        model's weights (updated every optimizer step) and use the EMA weights
        for validation, the best checkpoint, and the final restored model — a
        cheap generalization boost, especially with noisy validation. Default
        ``None`` (no EMA). Resumable (EMA state saved/restored via ``state_path``).
    monitor
        Key in the per-epoch dict to track for early stopping. Default
        ``'val_cc_norm'``. Use ``'val_loss'``, ``'val_cc'``, or any custom
        key you added via ``val_metrics``.
    mode
        ``'max'`` or ``'min'`` — direction of improvement on ``monitor``.
        Default ``'max'`` (paired with ``'val_cc_norm'``).
    ckpt_path
        If given, save the best-on-``monitor`` ``state_dict`` to this path
        and restore it at the end of ``fit()``.
    state_path
        If given, save the FULL training state (model + optimizer + LR-scheduler
        state + epoch + best score + patience counter + history) to this path
        after every epoch (atomically). If the file already exists at the start
        of ``fit()``, training **resumes** from it — so a killed/interrupted run
        continues exactly where it left off (unlike ``ckpt_path``, which only
        holds best model weights). Default ``None``.
    log_fn
        Called as ``log_fn(epoch_dict)`` once per epoch. Default: a small
        formatter that prints ``epoch | k=v | ...``. Override to log to
        WandB, MLflow, a file, etc.
    track_train_metrics
        If ``True`` (default), recompute ``val_metrics`` over the training
        predictions accumulated this epoch and add them to the epoch dict
        as ``'train_<name>'``. Useful for diagnosing overfitting but
        expensive on large datasets — accumulating ``(B, N, R, T)``
        responses across all train batches is the dominant per-epoch cost
        when ``N × R × T`` is in the millions (e.g. AA2's 494-cell
        population). Set to ``False`` to skip; ``train_loss`` is always
        reported.
    track_per_cell_best
        If ``True``, maintain a per-cell best-on-``monitor`` snapshot of
        the readout's per-N parameter and buffer slices throughout
        training. At end-of-fit, after the global ``ckpt_path`` restore,
        each cell's slice is overlaid with its individual-best snapshot.
        On no-shared-params models this is **strictly** at least as good
        as the vanilla restore on the validation set, cell-by-cell, by
        construction — every cell ends up at its individual val peak.
        The training trajectory itself is unchanged (no gradient masking,
        no per-cell stopping); the only difference is which checkpoint
        is restored at end. Requires ``val_metrics[monitor.removeprefix
        ('val_')]`` to return a ``(N,)`` per-cell tensor (the default
        :func:`_default_val_metrics` does this). Default ``False``.
    """

    def __init__(
        self,
        model: nn.Module,
        train_loader: DataLoader,
        val_loader: DataLoader,
        *,
        loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = mse_loss,
        val_metrics: Optional[Dict[str, Callable]] = None,
        optimizer: Optional[Optimizer] = None,
        device: Union[str, torch.device] = "cpu",
        max_epochs: int = 1000,
        patience: int = 10,
        min_delta: float = 0.0,
        reduce_lr_on_plateau: bool = False,
        lr_factor: float = 0.2,
        lr_patience: int = 30,
        lr_threshold: float = 1e-4,
        patience_after_lr_drop: Optional[int] = None,
        ema_decay: Optional[float] = None,
        monitor: str = "val_cc_norm",
        mode: str = "max",
        ckpt_path: Optional[Union[str, Path]] = None,
        state_path: Optional[Union[str, Path]] = None,
        log_fn: Callable[[Mapping[str, Any]], None] = _format_epoch,
        track_train_metrics: bool = True,
        track_per_cell_best: bool = False,
    ) -> None:
        """Validate the configuration, move the model to ``device`` and store settings.

        Raises
        ------
        ValueError
            If ``mode`` is not ``'max'``/``'min'``; if ``patience``,
            ``max_epochs`` or a given ``patience_after_lr_drop`` is below 1;
            or if a given ``ema_decay`` lies outside ``[0, 1)``.
        """
        if mode not in ("max", "min"):
            raise ValueError(f"mode must be 'max' or 'min', got {mode!r}")
        if patience < 1:
            raise ValueError(f"patience must be >= 1, got {patience}")
        if max_epochs < 1:
            raise ValueError(f"max_epochs must be >= 1, got {max_epochs}")
        # An LR drop resets the no-improvement counter to 0, so a post-drop
        # patience below 1 would end training on the very epoch the LR drops.
        if patience_after_lr_drop is not None and patience_after_lr_drop < 1:
            raise ValueError(
                "patience_after_lr_drop must be >= 1 or None, "
                f"got {patience_after_lr_drop}"
            )
        # ema = d * ema + (1 - d) * w is only a moving average for 0 <= d < 1:
        # d == 1 freezes the EMA at its first value, anything else diverges.
        if ema_decay is not None and not 0.0 <= ema_decay < 1.0:
            raise ValueError(
                f"ema_decay must satisfy 0 <= ema_decay < 1 or be None, got {ema_decay}"
            )

        self.device = torch.device(device)
        self.model = model.to(self.device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.loss_fn = loss_fn
        # A fresh default mapping is built per instance when none is supplied.
        self.val_metrics = (
            val_metrics if val_metrics is not None else _default_val_metrics()
        )
        # The default optimizer is created after the model has been moved.
        self.optimizer = (
            optimizer
            if optimizer is not None
            else AdamW(self.model.parameters(), lr=1e-3, weight_decay=1e-4)
        )
        self.max_epochs = max_epochs
        self.patience = patience
        self.min_delta = float(min_delta)
        self.reduce_lr_on_plateau = reduce_lr_on_plateau
        self.lr_factor = float(lr_factor)
        self.lr_patience = int(lr_patience)
        self.lr_threshold = float(lr_threshold)
        self.patience_after_lr_drop = patience_after_lr_drop
        self.ema_decay = ema_decay
        # EMA shadow of the model state_dict; created lazily by _update_ema().
        self._ema_state: Optional[Dict[str, torch.Tensor]] = None
        self.monitor = monitor
        self.mode = mode
        self.ckpt_path = Path(ckpt_path) if ckpt_path is not None else None
        self.state_path = Path(state_path) if state_path is not None else None
        self.log_fn = log_fn
        self.track_train_metrics = track_train_metrics
        self.track_per_cell_best = track_per_cell_best

    # ------------------------------------------------------------------
    # Hooks (subclass and override, or pass kwargs at construction time)
    # ------------------------------------------------------------------


[docs]
    def compute_loss(
        self, pred: torch.Tensor, responses: torch.Tensor
    ) -> torch.Tensor:
        """Return the loss for one ``(pred, responses)`` pair.

        The default implementation simply forwards both tensors, unchanged,
        to ``self.loss_fn``. Override to customise the training objective.
        """
        loss = self.loss_fn(pred, responses)
        return loss



[docs]
    def on_epoch_end(self, epoch: int, epoch_dict: Dict[str, Any]) -> None:
        """End-of-epoch hook; the default hands ``epoch_dict`` to ``self.log_fn``.

        ``epoch`` is not used by the default implementation; it is part of the
        signature for overriding subclasses.
        """
        emit = self.log_fn
        emit(epoch_dict)


    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------


[docs]
    def fit(self) -> List[Dict[str, Any]]:
        """Train until ``max_epochs`` or early-stop on ``monitor``.

        Each epoch runs one training pass and one validation pass, merges the
        results into an epoch dict, calls :meth:`on_epoch_end`, and then — in
        this order — updates the per-cell snapshots (if enabled), the best
        score / best checkpoint, the optional LR scheduler, the resumable
        state file, and the early-stop counter.

        After the loop the final weights are selected: the ``ckpt_path``
        checkpoint when that file exists; otherwise, with EMA on and no
        ``ckpt_path``, the current EMA weights. Per-cell snapshots are then
        overlaid on top when ``track_per_cell_best`` is set. Finally the EMA
        shadow is dropped so that a later :meth:`evaluate` call scores exactly
        the weights left in ``self.model``.

        Returns
        -------
        history : list of dict
            One dict per completed epoch, with keys ``'epoch'``, ``'train_*'``,
            and ``'val_*'``. When resuming from ``state_path`` the saved
            history is extended.

        Raises
        ------
        KeyError
            If ``monitor`` is not a key of the epoch dict.
        """
        history: List[Dict[str, Any]] = []
        best_score = -float("inf") if self.mode == "max" else float("inf")
        better = (
            (lambda new, best: new > best + self.min_delta)
            if self.mode == "max"
            else (lambda new, best: new < best - self.min_delta)
        )
        epochs_no_improvement = 0
        lr_dropped = False

        if self.track_per_cell_best:
            N = self.model.O
            self._per_cell_best_score = torch.full(
                (N,),
                -float("inf") if self.mode == "max" else float("inf"),
                device=self.device,
            )
            self._per_cell_snapshots: Dict[int, List[torch.Tensor]] = {}

        scheduler = None
        if self.reduce_lr_on_plateau:
            # min_lr is pinned one factor below the starting LR so the
            # scheduler can reduce exactly once.
            base_lr = self.optimizer.param_groups[0]["lr"]
            scheduler = ReduceLROnPlateau(
                self.optimizer, mode=self.mode, factor=self.lr_factor,
                patience=self.lr_patience, threshold=self.lr_threshold,
                min_lr=base_lr * self.lr_factor,
            )

        # Resume full training state if a checkpoint exists at state_path.
        start_epoch = 0
        if self.state_path is not None and self.state_path.exists():
            st = torch.load(self.state_path, map_location=self.device)
            self.model.load_state_dict(st["model"])
            self.optimizer.load_state_dict(st["optimizer"])
            if scheduler is not None and st.get("scheduler") is not None:
                scheduler.load_state_dict(st["scheduler"])
                # state_dict restores the saved threshold; re-apply the current
                # one so a resume can change the plateau-detection sensitivity.
                scheduler.threshold = self.lr_threshold
            start_epoch = st["epoch"] + 1
            best_score = st["best_score"]
            epochs_no_improvement = st["epochs_no_improvement"]
            history = st["history"]
            lr_dropped = st.get("lr_dropped", False)
            # Only adopt the saved EMA shadow when EMA is enabled for this run.
            # With EMA off nothing would ever update it, yet _evaluate() and
            # the checkpoint writer would keep using the frozen copy.
            self._ema_state = (
                st.get("ema_state") if self.ema_decay is not None else None
            )

        for epoch in range(start_epoch, self.max_epochs):
            train = self._train_one_epoch()
            val = self._evaluate(self.val_loader)
            epoch_dict: Dict[str, Any] = {"epoch": epoch}
            epoch_dict.update({f"train_{k}": v for k, v in train.items()})
            epoch_dict.update({f"val_{k}": v for k, v in val.items()})
            history.append(epoch_dict)
            self.on_epoch_end(epoch, epoch_dict)

            if self.monitor not in epoch_dict:
                raise KeyError(
                    f"monitor key {self.monitor!r} not in epoch dict; "
                    f"available keys: {sorted(epoch_dict)}"
                )

            # Per-cell snapshot update happens BEFORE the global best/patience
            # update so that snapshots track each cell's individual best
            # regardless of population-level early-stop behaviour. Needs the
            # per-cell monitor tensor; raises if it isn't one.
            if self.track_per_cell_best:
                self._update_per_cell_best(epoch_dict[self.monitor])

            score = _to_scalar(epoch_dict[self.monitor])
            if better(score, best_score):
                best_score = score
                if self.ckpt_path is not None:
                    self.ckpt_path.parent.mkdir(parents=True, exist_ok=True)
                    # With EMA on, the monitor was computed on the EMA weights,
                    # so the best checkpoint must hold the EMA weights too.
                    torch.save(
                        self._ema_state if self._ema_state is not None
                        else self.model.state_dict(),
                        self.ckpt_path,
                    )
                epochs_no_improvement = 0
            else:
                epochs_no_improvement += 1

            if scheduler is not None:
                prev_lr = self.optimizer.param_groups[0]["lr"]
                scheduler.step(score)
                if self.optimizer.param_groups[0]["lr"] < prev_lr - 1e-12:
                    # One LR drop happened — give a fresh patience window at the
                    # lower LR before early-stopping.
                    epochs_no_improvement = 0
                    lr_dropped = True

            # Persist full training state (atomically) so a killed run resumes.
            # Written to a sibling ".tmp" file first, then renamed over the
            # real path.
            if self.state_path is not None:
                self.state_path.parent.mkdir(parents=True, exist_ok=True)
                tmp = self.state_path.with_suffix(self.state_path.suffix + ".tmp")
                torch.save({
                    "model": self.model.state_dict(),
                    "optimizer": self.optimizer.state_dict(),
                    "scheduler": scheduler.state_dict() if scheduler is not None else None,
                    "epoch": epoch,
                    "best_score": best_score,
                    "epochs_no_improvement": epochs_no_improvement,
                    "history": history,
                    "lr_dropped": lr_dropped,
                    "ema_state": self._ema_state,
                }, tmp)
                tmp.replace(self.state_path)

            # After the (single) LR drop, optionally tighten patience: the model
            # is annealed and near-converged, so it needn't wait as long.
            eff_patience = (
                self.patience_after_lr_drop
                if (self.patience_after_lr_drop is not None and lr_dropped)
                else self.patience
            )
            if epochs_no_improvement >= eff_patience:
                break

        # ---- Final weight selection -------------------------------------
        # Step 1: whole-model restore. Either the best checkpoint ...
        if self.ckpt_path is not None and self.ckpt_path.exists():
            self.model.load_state_dict(
                torch.load(self.ckpt_path, map_location=self.device)
            )

        # ... or, with EMA and no ckpt_path, the current EMA weights. This is
        # also a whole-model load, so it must run BEFORE the per-cell overlay;
        # running it afterwards would overwrite every overlaid slice.
        if (self.ema_decay is not None and self.ckpt_path is None
                and self._ema_state is not None):
            self.model.load_state_dict(self._ema_state)

        # Step 2: overlay per-cell snapshots on top of the whole-model restore,
        # replacing each cell's readout slice with its individual-best.
        if self.track_per_cell_best:
            self._restore_per_cell_snapshots()

        # Step 3: drop the EMA shadow. _evaluate() swaps in self._ema_state
        # whenever it is set, so keeping it would make a post-fit evaluate()
        # score the last-epoch EMA weights instead of the model restored above.
        self._ema_state = None

        return history



[docs]
    def evaluate(self, loader: DataLoader) -> Dict[str, Any]:
        """Compute the loss and every ``val_metrics`` entry over ``loader``.

        Thin public wrapper around ``_evaluate``: no gradients are tracked and
        the returned keys carry no ``val_`` prefix — ``'loss'`` plus one key
        per entry of ``self.val_metrics``. Typical use is test-set scoring
        after training: ``fitter.evaluate(test_loader)``.
        """
        results = self._evaluate(loader)
        return results


    # ------------------------------------------------------------------
    # Per-cell snapshot bookkeeping (only used when track_per_cell_best=True)
    # ------------------------------------------------------------------

    def _per_cell_readout_tensors(self):
        """Yield the readout tensors whose first axis has length ``self.model.O``.

        Walks ``self.model.readout.parameters()`` and then
        ``self.model.readout.buffers()``, in that order, and yields each
        tensor that has at least one dimension and whose ``shape[0]`` equals
        ``self.model.O``. Zero-dimensional tensors (e.g. a scalar counter
        buffer) are skipped. The iteration order is what pairs a snapshot
        list with the live tensors on restore.
        """
        n_cells = self.model.O
        readout = self.model.readout
        for source in (readout.parameters(), readout.buffers()):
            for tensor in source:
                if tensor.dim() >= 1 and tensor.shape[0] == n_cells:
                    yield tensor

    def _update_per_cell_best(self, score: Any) -> None:
        """Snapshot the readout slice of every cell whose score just improved.

        ``score`` must be a tensor of shape ``(self.model.O,)``. A cell counts
        as improved when its score is not NaN and is strictly better (per
        ``self.mode``) than its stored best; ``min_delta`` is not applied
        here. For each improved cell, slice ``n`` of every tensor yielded by
        ``_per_cell_readout_tensors`` is cloned into
        ``self._per_cell_snapshots[n]``, and the stored best score is updated.

        NOTE(review): the slices are cloned from the live model tensors. When
        an EMA shadow is active, ``_evaluate`` computes the score on the EMA
        weights and then restores the live ones — confirm that snapshotting
        the live weights is intended in that case.

        Raises
        ------
        ValueError
            If ``score`` is not a tensor of shape ``(self.model.O,)``.
        """
        n_cells = self.model.O
        is_per_cell = isinstance(score, torch.Tensor) and score.shape == (n_cells,)
        if not is_per_cell:
            got_shape = tuple(score.shape) if isinstance(score, torch.Tensor) else None
            raise ValueError(
                f"track_per_cell_best=True requires the {self.monitor!r} "
                f"val metric to return a per-cell tensor of shape ({n_cells},); "
                f"got {type(score).__name__} with shape "
                f"{got_shape!r}. "
                f"Use val_metrics callables with reduction='none'."
            )

        score = score.to(self.device)
        best_so_far = self._per_cell_best_score
        if self.mode == "max":
            beats_best = score > best_so_far
        else:
            beats_best = score < best_so_far
        # NaN scores never count as an improvement.
        improved = beats_best & ~score.isnan()

        improved_cells = torch.nonzero(improved, as_tuple=True)[0].tolist()
        for n in improved_cells:
            snapshot = []
            for tensor in self._per_cell_readout_tensors():
                snapshot.append(tensor.data[n].detach().clone())
            self._per_cell_snapshots[n] = snapshot

        self._per_cell_best_score = torch.where(improved, score, best_so_far)

    def _restore_per_cell_snapshots(self) -> None:
        """Write every stored per-cell snapshot back into the live readout tensors.

        For each cell index in ``self._per_cell_snapshots``, the saved slices
        are paired positionally with the tensors yielded by
        ``_per_cell_readout_tensors`` and assigned in place to ``tensor.data[n]``.
        """
        for cell, saved_slices in self._per_cell_snapshots.items():
            live_tensors = self._per_cell_readout_tensors()
            for tensor, saved in zip(live_tensors, saved_slices):
                tensor.data[cell] = saved

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _train_one_epoch(self) -> Dict[str, Any]:
        """Run one optimisation pass over ``self.train_loader``.

        Per batch: move ``batch['stims']`` / ``batch['responses']`` to the
        device, zero the gradients, forward, compute the loss, backward,
        optimizer step, call ``model.detach()`` if the model defines it, and
        update the EMA shadow when ``ema_decay`` is set.

        Returns a dict with ``'loss'`` (mean of the per-batch losses; ``0.0``
        if the loader produced no batch) and, when ``track_train_metrics`` is
        on, one entry per ``val_metrics`` callable evaluated on the epoch's
        concatenated predictions and responses.
        """
        self.model.train()
        total_loss = 0.0
        steps = 0
        epoch_preds: List[torch.Tensor] = []
        epoch_responses: List[torch.Tensor] = []

        for steps, batch in enumerate(self.train_loader, start=1):
            stims = batch['stims'].to(self.device)
            responses = batch['responses'].to(self.device)

            self.optimizer.zero_grad()
            pred = self.model(stims)
            loss = self.compute_loss(pred, responses)
            loss.backward()
            self.optimizer.step()

            if hasattr(self.model, "detach"):
                self.model.detach()
            if self.ema_decay is not None:
                self._update_ema()

            total_loss += float(loss.detach().item())

            # Outputs are kept on CPU so the concatenated tensors used for
            # the metrics do not have to fit in device memory. Nothing is
            # kept at all when train-side metrics are disabled.
            if self.track_train_metrics:
                epoch_preds.append(pred.detach().cpu())
                epoch_responses.append(responses.detach().cpu())

        out: Dict[str, Any] = {"loss": total_loss / max(steps, 1)}
        if not self.track_train_metrics:
            return out

        with torch.no_grad():
            preds_cat = _pad_and_cat(epoch_preds)
            responses_cat = _pad_and_cat(epoch_responses)
            for name, metric in self.val_metrics.items():
                out[name] = metric(preds_cat, responses_cat)
        return out

    def _update_ema(self) -> None:
        """Fold the current model ``state_dict`` into the EMA shadow, in place.

        The first call seeds the shadow with a clone of every entry and
        returns. Afterwards floating-point entries are updated as
        ``ema = d * ema + (1 - d) * value`` with ``d = self.ema_decay``, while
        non-floating entries (integer/bool buffers) simply take the latest
        value.
        """
        live = self.model.state_dict()
        if self._ema_state is None:
            self._ema_state = {
                name: tensor.detach().clone() for name, tensor in live.items()
            }
            return

        decay = self.ema_decay
        for name, current in live.items():
            shadow = self._ema_state[name]
            if not current.is_floating_point():
                # Averaging is not meaningful here: track the latest value.
                shadow.copy_(current)
                continue
            shadow.mul_(decay).add_(current.detach(), alpha=1.0 - decay)

    def _evaluate(self, loader: DataLoader) -> Dict[str, Any]:
        """Evaluate ``loader`` on the EMA weights when an EMA shadow exists.

        Without a shadow this is just ``_evaluate_raw`` on the live weights.
        With one, the live ``state_dict`` is cloned, the shadow is loaded into
        the model for the evaluation, and the live weights are loaded back
        afterwards — also when the evaluation raises.
        """
        ema = self._ema_state
        if ema is None:
            return self._evaluate_raw(loader)

        live_weights = {}
        for name, tensor in self.model.state_dict().items():
            live_weights[name] = tensor.detach().clone()

        self.model.load_state_dict(ema)
        try:
            result = self._evaluate_raw(loader)
        finally:
            self.model.load_state_dict(live_weights)
        return result

    def _evaluate_raw(self, loader: DataLoader) -> Dict[str, Any]:
        """Evaluate the model's current weights over ``loader`` without gradients.

        Puts the model in eval mode, runs every batch forward (calling
        ``model.detach()`` after each one if the model defines it), gathers
        predictions and responses on CPU, concatenates them with
        ``_pad_and_cat``, and computes the loss and every ``val_metrics``
        entry once on the concatenated tensors. The model is left in eval
        mode.

        Returns a dict with ``'loss'`` as a Python float plus one key per
        ``val_metrics`` entry holding that callable's return value.
        """
        self.model.eval()
        collected_preds: List[torch.Tensor] = []
        collected_responses: List[torch.Tensor] = []

        with torch.no_grad():
            for batch in loader:
                stims = batch['stims'].to(self.device)
                responses = batch['responses'].to(self.device)

                pred = self.model(stims)
                if hasattr(self.model, "detach"):
                    self.model.detach()

                # Gathered on CPU so the concatenated tensors need not fit
                # in device memory.
                collected_preds.append(pred.cpu())
                collected_responses.append(responses.cpu())

            preds_cat = _pad_and_cat(collected_preds)
            responses_cat = _pad_and_cat(collected_responses)

            loss_value = self.compute_loss(preds_cat, responses_cat)
            out: Dict[str, Any] = {"loss": float(loss_value.item())}
            out.update(
                (name, metric(preds_cat, responses_cat))
                for name, metric in self.val_metrics.items()
            )
        return out