Source code for torch_measure.models.ncf

# Copyright (c) 2026 AIMS Foundations. MIT License.

"""Neural Collaborative Filter (NCF) that predicts response matrix entries."""

import math

import numpy as np
import torch
import torch.nn as nn
from scipy.optimize import minimize
from scipy.special import expit as sigmoid
from sentence_transformers import SentenceTransformer

from torch_measure.models._network import MLP



[docs]
class NCF(nn.Module):
    """Neural Collaborative Filter predictive model.

    A neural network model to predict response matrix entries.

    Architecture:
    - Sentence embeddings for both subject and item content
    - Small MLP head trained offline on training data

    Parameters
    ----------
    encoder : SentenceTransformer
        Pre-trained transformer model used to embed subject and item content.
    embedding_dim : int
        Output dimension of the encoder model.
    encode_batch_size : int
        Batch size used to embed subject and item content.
    hidden_dim : int
        Dimension of hidden layers.
    n_layers : int
        Number of layers (minimum 1).
    dropout : float
        Dropout rate between layers.
    device : str
        Device to place parameters on.
    """

    def __init__(
        self,
        encoder: SentenceTransformer,
        embedding_dim: int,
        encode_batch_size: int = 256,
        hidden_dim: int = 256,
        n_layers: int = 3,
        dropout: float = 0.1,
        device: str = "cpu",
    ) -> None:
        super().__init__()
        self.embedding_dim = embedding_dim
        self.encode_batch_size = encode_batch_size
        self._device = device

        self.encoder = encoder
        self.net = NCFHead(
            input_dim=embedding_dim * 2,
            hidden_dim=hidden_dim,
            n_layers=n_layers,
            dropout=dropout,
        ).to(self._device)

        # Calibration
        self._platt_a = 1.0
        self._platt_b = 0.0
        self._round_calibrated = False  # reset each round

    def _encode_single(self, subject: str, item: str) -> torch.Tensor:
        """Encode a subject-item pair."""
        u = self.encoder.encode(subject, convert_to_tensor=True, device=self._device)
        v = self.encoder.encode(item, convert_to_tensor=True, device=self._device)
        return u, v

    def _raw_prob(self, subject: str, item: str) -> float:
        """Forward pass through the NCF, returns probability in [0, 1]."""
        with torch.no_grad():
            u, v = self._encode_single(subject, item)
            x = torch.cat([u, v], dim=-1).unsqueeze(0)
            logit = self.net(x).squeeze(-1).item()
        return float(1.0 / (1.0 + math.exp(-logit)))

    def _fit_platt(self, labeled: list[dict]) -> None:
        """
        Fit a one-parameter Platt scaler on revealed labels.
        Uses scipy to optimise log-loss of:
            p_calibrated = sigmoid(a * logit + b)
        where logit = logit(_raw_prob(...)).
        """
        if not labeled:
            return

        logits, ys = [], []
        for ex in labeled:
            p = self._raw_prob(ex["subject_content"], ex["item_content"])
            p = float(np.clip(p, 1e-7, 1 - 1e-7))
            logits.append(math.log(p / (1 - p)))
            ys.append(float(ex["label"]))
        logits = np.array(logits)
        ys = np.array(ys)

        def neg_log_loss(params):
            a, b = params
            probs = sigmoid(a * logits + b)
            probs = np.clip(probs, 1e-7, 1 - 1e-7)
            return -np.mean(ys * np.log(probs) + (1 - ys) * np.log(1 - probs))

        result = minimize(neg_log_loss, x0=[1.0, 0.0], method="L-BFGS-B")
        if result.success:
            self._platt_a, self._platt_b = result.x
        self._round_calibrated = True


[docs]
    def encode_batch(self, subjects: list[str], items: list[str]) -> tuple[torch.Tensor, torch.Tensor]:
        """Encode a batch of subject-item pairs."""
        u = self.encoder.encode(
            subjects,
            convert_to_tensor=True,
            batch_size=self.encode_batch_size,
            show_progress_bar=True,
            device=self._device,
        )
        v = self.encoder.encode(
            items,
            convert_to_tensor=True,
            batch_size=self.encode_batch_size,
            show_progress_bar=True,
            device=self._device,
        )
        return u, v



[docs]
    def load_head(self, path: str) -> None:
        """Load pre-trained NCFHead weights from a state dict file."""
        state = torch.load(path, map_location=self._device, weights_only=True)
        self.net.load_state_dict(state)



[docs]
    def load_embeddings(self, path: str) -> tuple[torch.Tensor, torch.Tensor]:
        """Load pre-computed subject and item embeddings from a checkpoint file.

        Parameters
        ----------
        path : str
            Path to the embeddings checkpoint saved by ``torch.save`` with keys
            ``"subject_embeddings"`` and ``"item_embeddings"``.

        Returns
        -------
        tuple[torch.Tensor, torch.Tensor]
            Subject embeddings and item embeddings, respectively.
        """
        data = torch.load(path, weights_only=True)
        return data["subject_embeddings"], data["item_embeddings"]



[docs]
    def predict(self, data: dict, labeled: list[dict]) -> float:
        """Compute response probability P(subject passes item).

        1. Compute raw NCF probability.
        2. On first call of a round with labels available, fit Platt scaler.
        3. Apply calibrated scaling and return.

        Parameters
        ----------
        data : dict
            Dictionary with keys ``"subject_content"`` (str) and
            ``"item_content"`` (str) containing the raw text for the subject
            and item to score.
        labeled : list[dict]
            Previously observed subject-item-response records.

        Returns
        -------
        float
            Predicted probability that the subject passes the item, clipped to
            ``[1e-7, 1 - 1e-7]``.
        """
        # Fit Platt scaler once per round (on first call with labeled data)
        if labeled and not self._round_calibrated:
            self._fit_platt(labeled)

        raw_p = self._raw_prob(data["subject_content"], data["item_content"])
        raw_p = float(np.clip(raw_p, 1e-7, 1 - 1e-7))

        if not self._round_calibrated:
            return raw_p

        # Apply Platt calibration in log-odds space
        raw_logit = math.log(raw_p / (1 - raw_p))
        cal_logit = self._platt_a * raw_logit + self._platt_b
        return float(1.0 / (1.0 + math.exp(-cal_logit)))




class NCFHead(nn.Module):
    """Neural Collaborative Filter Multi-Layer Perceptron head.

    Maps sentence embeddings to a unidimensional output.

    Parameters
    ----------
    input_dim : int
        Dimension of the input (concatenated subject and item embeddings).
    hidden_dim : int
        Dimension of hidden layers.
    n_layers : int
        Number of layers (minimum 1).
    dropout : float
        Dropout rate between layers.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int = 256,
        n_layers: int = 3,
        dropout: float = 0.1,
    ) -> None:
        super().__init__()

        self.net = MLP(
            input_dim=input_dim,
            hidden_dim=hidden_dim,
            output_dim=1,
            n_layers=n_layers,
            dropout=dropout,
        )

    def forward(self, x):
        """Forward pass."""
        return self.net(x).squeeze(-1)