"""Loading datasets and evaluators."""

from collections.abc import Sequence
from typing import Any

from langchain_core.language_models import BaseLanguageModel

from langchain_classic.chains.base import Chain
from langchain_classic.evaluation.agents.trajectory_eval_chain import (
    TrajectoryEvalChain,
)
from langchain_classic.evaluation.comparison import PairwiseStringEvalChain
from langchain_classic.evaluation.comparison.eval_chain import (
    LabeledPairwiseStringEvalChain,
)
from langchain_classic.evaluation.criteria.eval_chain import (
    CriteriaEvalChain,
    LabeledCriteriaEvalChain,
)
from langchain_classic.evaluation.embedding_distance.base import (
    EmbeddingDistanceEvalChain,
    PairwiseEmbeddingDistanceEvalChain,
)
from langchain_classic.evaluation.exact_match.base import ExactMatchStringEvaluator
from langchain_classic.evaluation.parsing.base import (
    JsonEqualityEvaluator,
    JsonValidityEvaluator,
)
from langchain_classic.evaluation.parsing.json_distance import JsonEditDistanceEvaluator
from langchain_classic.evaluation.parsing.json_schema import JsonSchemaEvaluator
from langchain_classic.evaluation.qa import (
    ContextQAEvalChain,
    CotQAEvalChain,
    QAEvalChain,
)
from langchain_classic.evaluation.regex_match.base import RegexMatchStringEvaluator
from langchain_classic.evaluation.schema import (
    EvaluatorType,
    LLMEvalChain,
    StringEvaluator,
)
from langchain_classic.evaluation.scoring.eval_chain import (
    LabeledScoreStringEvalChain,
    ScoreStringEvalChain,
)
from langchain_classic.evaluation.string_distance.base import (
    PairwiseStringDistanceEvalChain,
    StringDistanceEvalChain,
)


def load_dataset(uri: str) -> list[dict]:
    """Load a dataset from the [LangChainDatasets on HuggingFace](https://huggingface.co/LangChainDatasets).

    Args:
        uri: The URI of the dataset to load.

    Returns:
        A list of dictionaries, each representing a row in the dataset.

    **Prerequisites**

    ```bash
    pip install datasets
    ```

    Examples:
        ```python
        from langchain_classic.evaluation import load_dataset

        ds = load_dataset("llm-math")
        ```
    """
    try:
        from datasets import load_dataset
    except ImportError as e:
        msg = (
            "load_dataset requires the `datasets` package."
            " Please install with `pip install datasets`."
        )
        raise ImportError(msg) from e
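
    # ``datasets.load_dataset`` returns a ``DatasetDict`` keyed by split; the
    # datasets on LangChainDatasets are assumed to expose a single "train" split.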
    dataset = load_dataset(f"LangChainDatasets/{uri}")
    return list(dataset["train"])
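

# Registry mapping each ``EvaluatorType`` to the evaluator class that
# implements it; ``load_evaluator`` dispatches through this table.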
_EVALUATOR_MAP: dict[
    EvaluatorType,
    type[LLMEvalChain] | type[Chain] | type[StringEvaluator],
] = {
    EvaluatorType.QA: QAEvalChain,
    EvaluatorType.COT_QA: CotQAEvalChain,
    EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
    EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain,
    EvaluatorType.SCORE_STRING: ScoreStringEvalChain,
    EvaluatorType.LABELED_PAIRWISE_STRING: LabeledPairwiseStringEvalChain,
    EvaluatorType.LABELED_SCORE_STRING: LabeledScoreStringEvalChain,
    EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain,
    EvaluatorType.CRITERIA: CriteriaEvalChain,
    EvaluatorType.LABELED_CRITERIA: LabeledCriteriaEvalChain,
    EvaluatorType.STRING_DISTANCE: StringDistanceEvalChain,
    EvaluatorType.PAIRWISE_STRING_DISTANCE: PairwiseStringDistanceEvalChain,
    EvaluatorType.EMBEDDING_DISTANCE: EmbeddingDistanceEvalChain,
    EvaluatorType.PAIRWISE_EMBEDDING_DISTANCE: PairwiseEmbeddingDistanceEvalChain,
    EvaluatorType.JSON_VALIDITY: JsonValidityEvaluator,
    EvaluatorType.JSON_EQUALITY: JsonEqualityEvaluator,
    EvaluatorType.JSON_EDIT_DISTANCE: JsonEditDistanceEvaluator,
    EvaluatorType.JSON_SCHEMA_VALIDATION: JsonSchemaEvaluator,
    EvaluatorType.REGEX_MATCH: RegexMatchStringEvaluator,
    EvaluatorType.EXACT_MATCH: ExactMatchStringEvaluator,
}


def load_evaluator(
    evaluator: EvaluatorType,
    *,
    llm: BaseLanguageModel | None = None,
    **kwargs: Any,
) -> Chain | StringEvaluator:
    """Load the requested evaluator specified by its ``EvaluatorType``.

    Parameters
    ----------
    evaluator : EvaluatorType
        The type of evaluator to load.
    llm : BaseLanguageModel, optional
        The language model to use for evaluation, by default None.
    **kwargs : Any
        Additional keyword arguments to pass to the evaluator.

    Returns
    -------
    Chain or StringEvaluator
        The loaded evaluator.

    Examples
    --------
    >>> from langchain_classic.evaluation import load_evaluator, EvaluatorType
    >>> evaluator = load_evaluator(EvaluatorType.QA)
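    >>> # String evaluators such as exact match need no LLM:
    >>> exact_match = load_evaluator(EvaluatorType.EXACT_MATCH)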
    """
    if evaluator not in _EVALUATOR_MAP:
        msg = (
            f"Unknown evaluator type: {evaluator}"
            f"\nValid types are: {list(_EVALUATOR_MAP.keys())}"
        )
        raise ValueError(msg)

    evaluator_cls = _EVALUATOR_MAP[evaluator]
    if issubclass(evaluator_cls, LLMEvalChain):
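        # LLM-backed evaluators need a model: prefer ``langchain_openai``,
        # then fall back to the legacy ``langchain_community`` import.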
        try:
            try:
                from langchain_openai import ChatOpenAI
            except ImportError:
                try:
                    from langchain_community.chat_models.openai import (  # type: ignore[no-redef]
                        ChatOpenAI,
                    )
                except ImportError as e:
                    msg = (
                        "Could not import langchain_openai or fall back to "
                        "langchain_community. Please install langchain_openai "
                        "or specify a language model explicitly. "
                        "It's recommended to install langchain_openai AND "
                        "specify a language model explicitly."
                    )
                    raise ImportError(msg) from e

            llm = llm or ChatOpenAI(model="gpt-4", seed=42, temperature=0)
        except Exception as e:
            msg = (
                f"Evaluation with the {evaluator_cls} requires a "
                "language model to function."
                " Failed to create the default 'gpt-4' model."
                " Please manually provide an evaluation LLM"
                " or check your OpenAI credentials."
            )
            raise ValueError(msg) from e
        return evaluator_cls.from_llm(llm=llm, **kwargs)
    return evaluator_cls(**kwargs)


def load_evaluators(
    evaluators: Sequence[EvaluatorType],
    *,
    llm: BaseLanguageModel | None = None,
    config: dict | None = None,
    **kwargs: Any,
) -> list[Chain | StringEvaluator]:
    """Load evaluators specified by a list of evaluator types.

    Parameters
    ----------
    evaluators : Sequence[EvaluatorType]
        The list of evaluator types to load.
    llm : BaseLanguageModel, optional
        The language model to use for evaluation; if none is provided, a
        default ChatOpenAI gpt-4 model is used.
    config : dict, optional
        A dictionary mapping evaluator types to additional keyword arguments,
        by default None.
    **kwargs : Any
        Additional keyword arguments to pass to all evaluators.

    Returns
    -------
    list[Chain | StringEvaluator]
        The loaded evaluators.

    Examples
    --------
    >>> from langchain_classic.evaluation import load_evaluators, EvaluatorType
    >>> evaluators = [EvaluatorType.QA, EvaluatorType.CRITERIA]
    >>> loaded_evaluators = load_evaluators(evaluators, criteria="helpfulness")
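    >>> # Per-evaluator kwargs can be routed through ``config``:
    >>> loaded = load_evaluators(
    ...     [EvaluatorType.CRITERIA],
    ...     config={EvaluatorType.CRITERIA: {"criteria": "conciseness"}},
    ... )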
    """
    loaded = []
    for evaluator in evaluators:
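        # Per-evaluator ``config`` entries take precedence over shared kwargs.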
        _kwargs = config.get(evaluator, {}) if config else {}
        loaded.append(load_evaluator(evaluator, llm=llm, **{**kwargs, **_kwargs}))
    return loaded
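

# A minimal end-to-end sketch (illustrative only; assumes the ``datasets``
# package is installed and that the "llm-math" rows carry a hypothetical
# "answer" field):
#
#     from langchain_classic.evaluation import EvaluatorType
#
#     rows = load_dataset("llm-math")
#     evaluator = load_evaluator(EvaluatorType.EXACT_MATCH)
#     result = evaluator.evaluate_strings(
#         prediction=rows[0]["answer"], reference=rows[0]["answer"]
#     )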