Source code for torch_measure.models.llm_judge

# Copyright (c) 2026 AIMS Foundations. MIT License.

"""LLM-as-judge predictive model using next-token yes/no probabilities."""

import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

_JUDGE_TEMPLATE = (
    "You will see a description of an AI subject and an"
    " evaluation item. Decide whether the subject would answer the item"
    " correctly. Reply with a single token: yes or no."
    "\n    Benchmark: {benchmark}"
    "\n    Condition: {condition}"
    "\n    Subject: {subject_content}"
    "\n    Item: {item_content}"
    "\n    Answer:"
)



[docs]
class LLMJudge:
    """LLM-as-judge predictive model.

    Uses the next-token yes/no log-probability ratio from a causal language
    model to predict whether a subject would answer an item correctly.
    Optionally prepends same-subject in-context examples from ``labeled``.

    Parameters
    ----------
    model_id : str
        HuggingFace model identifier.
    max_icl : int
        Maximum number of same-subject labeled examples to prepend as
        in-context demonstrations.
    batch_size : int
        Batch size for LLM inference.
    device : str
        Device passed to ``device_map``. Use ``"auto"`` for multi-GPU.
    """

    def __init__(
        self,
        model_id: str = "Qwen/Qwen2-7B-Instruct",
        max_icl: int = 5,
        batch_size: int = 32,
        device: str = "auto",
    ) -> None:
        self.max_icl = max_icl
        self.batch_size = batch_size

        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map=device,
            attn_implementation="sdpa",
        )
        self.model.eval()

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"

        self._yes_id = self.tokenizer.encode(" yes", add_special_tokens=False)[-1]
        self._no_id = self.tokenizer.encode(" no", add_special_tokens=False)[-1]

    def _build_prompt(self, data: dict, labeled: list[dict] | None = None) -> str:
        """Build the judge prompt, optionally with same-subject ICL examples."""
        if labeled:
            same_subj = [ex for ex in labeled if ex["subject_content"] == data["subject_content"]][-self.max_icl :]
            if same_subj:
                icl = "\n\n".join(
                    _JUDGE_TEMPLATE.format(
                        benchmark=ex.get("benchmark", ""),
                        condition=ex.get("condition", ""),
                        subject_content=ex["subject_content"],
                        item_content=ex["item_content"],
                    )
                    + (" yes" if ex["label"] >= 0.5 else " no")
                    for ex in same_subj
                )
                return icl + "\n\n" + _JUDGE_TEMPLATE.format(**data)
        return _JUDGE_TEMPLATE.format(**data)

    def _batch_probs(self, prompts: list[str]) -> list[float]:
        """Run a batch of prompts; return p_yes / (p_yes + p_no) for each."""
        ids = self.tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(self.model.device)
        with torch.no_grad():
            logits = self.model(**ids).logits[:, -1, :]
        lp = torch.log_softmax(logits, dim=-1)
        p_yes = lp[:, self._yes_id].exp()
        p_no = lp[:, self._no_id].exp()
        return (p_yes / (p_yes + p_no)).tolist()


[docs]
    def predict(self, data: dict, labeled: list[dict] | None = None) -> float:
        """Compute response probability P(subject passes item).

        Parameters
        ----------
        data : dict
            Dictionary with keys ``"subject_content"``, ``"item_content"``,
            ``"benchmark"``, and ``"condition"``.
        labeled : list[dict] or None
            Previously observed subject-item-response records with keys
            ``"subject_content"``, ``"item_content"``, ``"benchmark"``,
            ``"condition"``, and ``"label"`` (float in [0, 1]).  Same-subject
            records are prepended as in-context examples.

        Returns
        -------
        float
            Predicted probability that the subject passes the item, clipped to
            ``[1e-7, 1 - 1e-7]``.
        """
        prompt = self._build_prompt(data, labeled)
        prob = self._batch_probs([prompt])[0]
        return float(np.clip(prob, 1e-7, 1 - 1e-7))