"""Interfaces to be implemented by general evaluators."""

from __future__ import annotations

import logging
from abc import ABC, abstractmethod
from collections.abc import Sequence
from enum import Enum
from typing import Any
from warnings import warn

from langchain_core.agents import AgentAction
from langchain_core.language_models import BaseLanguageModel
from langchain_core.runnables.config import run_in_executor

from langchain_classic.chains.base import Chain

logger = logging.getLogger(__name__)


class EvaluatorType(str, Enum):
    """The types of the evaluators.
    """

    QA = "qa"
    """Question answering evaluator, which grades answers to questions
    directly using an LLM."""
    COT_QA = "cot_qa"
    """Chain-of-thought question answering evaluator, which grades answers
    to questions using chain-of-thought 'reasoning'."""
    CONTEXT_QA = "context_qa"
    """Question answering evaluator that incorporates 'context' in the response."""
    PAIRWISE_STRING = "pairwise_string"
    """The pairwise string evaluator, which predicts the preferred prediction
    between two models."""
    SCORE_STRING = "score_string"
    """The scored string evaluator, which gives a score between 1 and 10
    to a prediction."""
    LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
    """The labeled pairwise string evaluator, which predicts the preferred
    prediction between two models based on a ground truth reference label."""
    LABELED_SCORE_STRING = "labeled_score_string"
    """The labeled scored string evaluator, which gives a score between 1 and 10
    to a prediction based on a ground truth reference label."""
    AGENT_TRAJECTORY = "trajectory"
    """The agent trajectory evaluator, which grades the agent's intermediate steps."""
    CRITERIA = "criteria"
    """The criteria evaluator, which evaluates a model based on a
    custom set of criteria without any reference labels."""
    LABELED_CRITERIA = "labeled_criteria"
    """The labeled criteria evaluator, which evaluates a model based on a
    custom set of criteria, with a reference label."""
    STRING_DISTANCE = "string_distance"
    """Compare predictions to a reference answer using string edit distances."""
    EXACT_MATCH = "exact_match"
    """Compare predictions to a reference answer using exact matching."""
    REGEX_MATCH = "regex_match"
    """Compare predictions to a reference answer using regular expressions."""
    PAIRWISE_STRING_DISTANCE = "pairwise_string_distance"
    """Compare predictions based on string edit distances."""
    EMBEDDING_DISTANCE = "embedding_distance"
    """Compare a prediction to a reference label using embedding distance."""
    PAIRWISE_EMBEDDING_DISTANCE = "pairwise_embedding_distance"
    """Compare two predictions using embedding distance."""
    JSON_VALIDITY = "json_validity"
    """Check if a prediction is valid JSON."""
    JSON_EQUALITY = "json_equality"
    """Check if a prediction is equal to a reference JSON."""
    JSON_EDIT_DISTANCE = "json_edit_distance"
    """Compute the edit distance between two JSON strings after canonicalization."""
    JSON_SCHEMA_VALIDATION = "json_schema_validation"
    """Check if a prediction is valid JSON according to a JSON schema."""


class LLMEvalChain(Chain):
    """A base class for evaluators that use an LLM.
    """

    @classmethod
    @abstractmethod
    def from_llm(cls, llm: BaseLanguageModel, **kwargs: Any) -> LLMEvalChain:
        """Create a new evaluator from an LLM."""


class _EvalArgsMixin:
    """Mixin for checking evaluation arguments.
    """

    @property
    def requires_reference(self) -> bool:
        """Whether this evaluator requires a reference label."""
        return False

    @property
    def requires_input(self) -> bool:
        """Whether this evaluator requires an input string."""
        return False

    @property
    def _skip_input_warning(self) -> str:
        """Warning to show when input is ignored."""
        return f"Ignoring input in {self.__class__.__name__}, as it is not expected."

    @property
    def _skip_reference_warning(self) -> str:
        """Warning to show when reference is ignored."""
        return (
            f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
        )

    def _check_evaluation_args(
        self,
        reference: str | None = None,
        input_: str | None = None,
    ) -> None:
        """Check if the evaluation arguments are valid.

        Args:
            reference: The reference label.
            input_: The input string.

        Raises:
            ValueError: If the evaluator requires an input string but none is
                provided, or if the evaluator requires a reference label but
                none is provided.
        """
        if self.requires_input and input_ is None:
            msg = f"{self.__class__.__name__} requires an input string."
            raise ValueError(msg)
        if input_ is not None and not self.requires_input:
            warn(self._skip_input_warning, stacklevel=3)
        if self.requires_reference and reference is None:
            msg = f"{self.__class__.__name__} requires a reference string."
            raise ValueError(msg)
        if reference is not None and not self.requires_reference:
            warn(self._skip_reference_warning, stacklevel=3)


class StringEvaluator(_EvalArgsMixin, ABC):
    """String evaluator interface.

    Grade, tag, or otherwise evaluate predictions relative to their inputs
    and/or reference labels.
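
    Example:
        A minimal concrete subclass; the exact-match scoring is a sketch
        for illustration, not one of the packaged evaluators.

        .. code-block:: python

            class ExactMatchEvaluator(StringEvaluator):
                @property
                def requires_reference(self) -> bool:
                    return True

                def _evaluate_strings(
                    self, *, prediction, reference=None, input=None, **kwargs
                ):
                    return {"score": int(prediction.strip() == reference.strip())}

            ExactMatchEvaluator().evaluate_strings(prediction="42", reference="42")
            # -> {"score": 1}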
    """

    @property
    def evaluation_name(self) -> str:
        """The name of the evaluation."""
        return self.__class__.__name__

    @property
    def requires_reference(self) -> bool:
        """Whether this evaluator requires a reference label."""
        return False

    @abstractmethod
    def _evaluate_strings(
        self,
        *,
        prediction: str | Any,
        reference: str | Any | None = None,
        input: str | Any | None = None,  # noqa: A002
        **kwargs: Any,
    ) -> dict:
        """Evaluate Chain or LLM output, based on optional input and label.

        Args:
            prediction: The LLM or chain prediction to evaluate.
            reference: The reference label to evaluate against.
            input: The input to consider during evaluation.
            **kwargs: Additional keyword arguments, including callbacks, tags, etc.

        Returns:
            The evaluation results containing the score or value.
            It is recommended that the dictionary contain the following keys:

            - score: the score of the evaluation, if applicable.
            - value: the string value of the evaluation, if applicable.
            - reasoning: the reasoning for the evaluation, if applicable.
        """

    async def _aevaluate_strings(
        self,
        *,
        prediction: str | Any,
        reference: str | Any | None = None,
        input: str | Any | None = None,  # noqa: A002
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate Chain or LLM output, based on optional input and label.

        Args:
            prediction: The LLM or chain prediction to evaluate.
            reference: The reference label to evaluate against.
            input: The input to consider during evaluation.
            **kwargs: Additional keyword arguments, including callbacks, tags, etc.

        Returns:
            The evaluation results containing the score or value.
            It is recommended that the dictionary contain the following keys:

            - score: the score of the evaluation, if applicable.
            - value: the string value of the evaluation, if applicable.
            - reasoning: the reasoning for the evaluation, if applicable.
        """  # noqa: E501
        return await run_in_executor(
            None,
            self._evaluate_strings,
            prediction=prediction,
            reference=reference,
            input=input,
            **kwargs,
        )

    def evaluate_strings(
        self,
        *,
        prediction: str,
        reference: str | None = None,
        input: str | None = None,  # noqa: A002
        **kwargs: Any,
    ) -> dict:
        """Evaluate Chain or LLM output, based on optional input and label.

        Args:
            prediction: The LLM or chain prediction to evaluate.
            reference: The reference label to evaluate against.
            input: The input to consider during evaluation.
            **kwargs: Additional keyword arguments, including callbacks, tags, etc.

        Returns:
            The evaluation results containing the score or value.
        """
        self._check_evaluation_args(reference=reference, input_=input)
        return self._evaluate_strings(
            prediction=prediction,
            reference=reference,
            input=input,
            **kwargs,
        )

    async def aevaluate_strings(
        self,
        *,
        prediction: str,
        reference: str | None = None,
        input: str | None = None,  # noqa: A002
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate Chain or LLM output, based on optional input and label.

        Args:
            prediction: The LLM or chain prediction to evaluate.
            reference: The reference label to evaluate against.
            input: The input to consider during evaluation.
            **kwargs: Additional keyword arguments, including callbacks, tags, etc.

        Returns:
            The evaluation results containing the score or value.
        """  # noqa: E501
        self._check_evaluation_args(reference=reference, input_=input)
        return await self._aevaluate_strings(
            prediction=prediction,
            reference=reference,
            input=input,
            **kwargs,
        )


class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
    """Compare the output of two models (or two outputs of the same model).
    """

    @abstractmethod
    def _evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: str | None = None,
        input: str | None = None,  # noqa: A002
        **kwargs: Any,
    ) -> dict:
        """Evaluate the output string pairs.

        Args:
            prediction: The output string from the first model.
            prediction_b: The output string from the second model.
            reference: The expected output / reference string.
            input: The input string.
            **kwargs: Additional keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            `dict` containing the preference, scores, and/or other information.
        """

    async def _aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: str | None = None,
        input: str | None = None,  # noqa: A002
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate the output string pairs.

        Args:
            prediction: The output string from the first model.
            prediction_b: The output string from the second model.
            reference: The expected output / reference string.
            input: The input string.
            **kwargs: Additional keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            `dict` containing the preference, scores, and/or other information.
        """
        return await run_in_executor(
            None,
            self._evaluate_string_pairs,
            prediction=prediction,
            prediction_b=prediction_b,
            reference=reference,
            input=input,
            **kwargs,
        )

    def evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: str | None = None,
        input: str | None = None,  # noqa: A002
        **kwargs: Any,
    ) -> dict:
        """Evaluate the output string pairs.

        Args:
            prediction: The output string from the first model.
            prediction_b: The output string from the second model.
            reference: The expected output / reference string.
            input: The input string.
            **kwargs: Additional keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            `dict` containing the preference, scores, and/or other information.
        """
        self._check_evaluation_args(reference=reference, input_=input)
        return self._evaluate_string_pairs(
            prediction=prediction,
            prediction_b=prediction_b,
            reference=reference,
            input=input,
            **kwargs,
        )

    async def aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: str | None = None,
        input: str | None = None,  # noqa: A002
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate the output string pairs.

        Args:
            prediction: The output string from the first model.
            prediction_b: The output string from the second model.
            reference: The expected output / reference string.
            input: The input string.
            **kwargs: Additional keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            `dict` containing the preference, scores, and/or other information.
        """
        self._check_evaluation_args(reference=reference, input_=input)
        return await self._aevaluate_string_pairs(
            prediction=prediction,
            prediction_b=prediction_b,
            reference=reference,
            input=input,
            **kwargs,
        )


class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
    """Interface for evaluating agent trajectories.
    """

    @property
    def requires_input(self) -> bool:
        """Whether this evaluator requires an input string."""
        return True

    @abstractmethod
    def _evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[tuple[AgentAction, str]],
        input: str,  # noqa: A002
        reference: str | None = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate a trajectory.

        Args:
            prediction: The final predicted response.
            agent_trajectory: The intermediate steps forming the agent trajectory.
            input: The input to the agent.
            reference: The reference answer.
            **kwargs: Additional keyword arguments.

        Returns:
            The evaluation result.
        """

    async def _aevaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[tuple[AgentAction, str]],
        input: str,  # noqa: A002
        reference: str | None = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate a trajectory.

        Args:
            prediction: The final predicted response.
            agent_trajectory: The intermediate steps forming the agent trajectory.
            input: The input to the agent.
            reference: The reference answer.
            **kwargs: Additional keyword arguments.

        Returns:
            The evaluation result.
        """
        return await run_in_executor(
            None,
            self._evaluate_agent_trajectory,
            prediction=prediction,
            agent_trajectory=agent_trajectory,
            reference=reference,
            input=input,
            **kwargs,
        )

    def evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[tuple[AgentAction, str]],
        input: str,  # noqa: A002
        reference: str | None = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate a trajectory.

        Args:
            prediction: The final predicted response.
            agent_trajectory: The intermediate steps forming the agent trajectory.
            input: The input to the agent.
            reference: The reference answer.
            **kwargs: Additional keyword arguments.

        Returns:
            The evaluation result.
        """
        self._check_evaluation_args(reference=reference, input_=input)
        return self._evaluate_agent_trajectory(
            prediction=prediction,
            input=input,
            agent_trajectory=agent_trajectory,
            reference=reference,
            **kwargs,
        )

    async def aevaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[tuple[AgentAction, str]],
        input: str,  # noqa: A002
        reference: str | None = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate a trajectory.

        Args:
            prediction: The final predicted response.
            agent_trajectory: The intermediate steps forming the agent trajectory.
            input: The input to the agent.
            reference: The reference answer.
            **kwargs: Additional keyword arguments.

        Returns:
            The evaluation result.
        """
        self._check_evaluation_args(reference=reference, input_=input)
        return await self._aevaluate_agent_trajectory(
            prediction=prediction,
            input=input,
            agent_trajectory=agent_trajectory,
            reference=reference,
            **kwargs,
        )