Source code for torch_measure.models.llm_judge

# Copyright (c) 2026 AIMS Foundations. MIT License.

"""LLM-as-judge predictive model using next-token yes/no probabilities."""

import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Prompt skeleton for one judge query: an instruction line followed by the
# benchmark / condition / subject / item fields, each on its own line
# (newline plus four spaces), ending with "Answer:". Filled via ``str.format``.
_JUDGE_TEMPLATE = "\n    ".join(
    (
        "You will see a description of an AI subject and an evaluation item."
        " Decide whether the subject would answer the item correctly."
        " Reply with a single token: yes or no.",
        "Benchmark: {benchmark}",
        "Condition: {condition}",
        "Subject: {subject_content}",
        "Item: {item_content}",
        "Answer:",
    )
)


[docs] class LLMJudge: """LLM-as-judge predictive model. Uses the next-token yes/no log-probability ratio from a causal language model to predict whether a subject would answer an item correctly. Optionally prepends same-subject in-context examples from ``labeled``. Parameters ---------- model_id : str HuggingFace model identifier. max_icl : int Maximum number of same-subject labeled examples to prepend as in-context demonstrations. batch_size : int Batch size for LLM inference. device : str Device passed to ``device_map``. Use ``"auto"`` for multi-GPU. """ def __init__( self, model_id: str = "Qwen/Qwen2-7B-Instruct", max_icl: int = 5, batch_size: int = 32, device: str = "auto", ) -> None: self.max_icl = max_icl self.batch_size = batch_size self.tokenizer = AutoTokenizer.from_pretrained(model_id) self.model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.bfloat16, device_map=device, attn_implementation="sdpa", ) self.model.eval() if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token self.tokenizer.padding_side = "left" self._yes_id = self.tokenizer.encode(" yes", add_special_tokens=False)[-1] self._no_id = self.tokenizer.encode(" no", add_special_tokens=False)[-1] def _build_prompt(self, data: dict, labeled: list[dict] | None = None) -> str: """Build the judge prompt, optionally with same-subject ICL examples.""" if labeled: same_subj = [ex for ex in labeled if ex["subject_content"] == data["subject_content"]][-self.max_icl :] if same_subj: icl = "\n\n".join( _JUDGE_TEMPLATE.format( benchmark=ex.get("benchmark", ""), condition=ex.get("condition", ""), subject_content=ex["subject_content"], item_content=ex["item_content"], ) + (" yes" if ex["label"] >= 0.5 else " no") for ex in same_subj ) return icl + "\n\n" + _JUDGE_TEMPLATE.format(**data) return _JUDGE_TEMPLATE.format(**data) def _batch_probs(self, prompts: list[str]) -> list[float]: """Run a batch of prompts; return p_yes / (p_yes + p_no) for each.""" ids = 
self.tokenizer( prompts, return_tensors="pt", padding=True, truncation=True, ).to(self.model.device) with torch.no_grad(): logits = self.model(**ids).logits[:, -1, :] lp = torch.log_softmax(logits, dim=-1) p_yes = lp[:, self._yes_id].exp() p_no = lp[:, self._no_id].exp() return (p_yes / (p_yes + p_no)).tolist()
[docs] def predict(self, data: dict, labeled: list[dict] | None = None) -> float: """Compute response probability P(subject passes item). Parameters ---------- data : dict Dictionary with keys ``"subject_content"``, ``"item_content"``, ``"benchmark"``, and ``"condition"``. labeled : list[dict] or None Previously observed subject-item-response records with keys ``"subject_content"``, ``"item_content"``, ``"benchmark"``, ``"condition"``, and ``"label"`` (float in [0, 1]). Same-subject records are prepended as in-context examples. Returns ------- float Predicted probability that the subject passes the item, clipped to ``[1e-7, 1 - 1e-7]``. """ prompt = self._build_prompt(data, labeled) prob = self._batch_probs([prompt])[0] return float(np.clip(prob, 1e-7, 1 - 1e-7))