From 279c7e1e9e3e1db3caec36f2b5e57ad4a5e9fb84 Mon Sep 17 00:00:00 2001
From: John Huang
Date: Thu, 7 May 2026 00:48:54 -0700
Subject: [PATCH 1/5] fix typing using autoevals scorers

---
 py/pyproject.toml              |  1 +
 py/src/braintrust/framework.py |  6 +++---
 py/src/braintrust/score.py     | 23 +++++++++++++++++++----
 py/uv.lock                     |  5 ++++-
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/py/pyproject.toml b/py/pyproject.toml
index 413e7f88..b3434c73 100644
--- a/py/pyproject.toml
+++ b/py/pyproject.toml
@@ -177,6 +177,7 @@ test-cli = [
 
 test-types = [
     {include-group = "test"},
+    "autoevals==0.2.0",
    "pyright==1.1.408",
    "mypy==1.20.0",
 ]
diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py
index 5367dd40..b1544010 100644
--- a/py/src/braintrust/framework.py
+++ b/py/src/braintrust/framework.py
@@ -48,7 +48,7 @@
     validate_parameters,
 )
 from .resource_manager import ResourceManager
-from .score import Classification, ClassificationItem, Score, is_classification, is_score, is_scorer
+from .score import Classification, ClassificationItem, Score, ScoreLike, is_classification, is_score, is_scorer
 from .serializable_data_class import SerializableDataClass
 from .span_types import SpanTypeAttribute
 from .types._eval import EvalCaseDict, EvalCaseDictNoOutput, ExperimentDatasetEvent
@@ -216,7 +216,7 @@ class EvalScorerArgs(SerializableDataClass, Generic[Input, Output, Expected]):
     metadata: Metadata | None = None
 
 
-OneOrMoreScores = float | int | bool | None | Score | list[Score]
+OneOrMoreScores = float | int | bool | None | ScoreLike | list[Score]
 OneOrMoreClassifications = None | Classification | Mapping[str, Any] | list[Classification | Mapping[str, Any]]
 
 
@@ -1286,7 +1286,7 @@ def _classifier_name(classifier, classifier_idx):
     return _callable_name(classifier, classifier_idx, "classifier")
 
 
-def _build_span_metadata(results: list[Score] | list[Classification]) -> Metadata | None:
+def _build_span_metadata(results: list[ScoreLike] | list[Classification]) -> Metadata | None:
     if not results:
         return None
     if len(results) == 1:
diff --git a/py/src/braintrust/score.py b/py/src/braintrust/score.py
index ca500984..43515669 100644
--- a/py/src/braintrust/score.py
+++ b/py/src/braintrust/score.py
@@ -2,9 +2,10 @@
 import inspect
 import warnings
 from abc import ABC, abstractmethod
-from typing import Any, TypedDict
+from collections.abc import Mapping
+from typing import Any, Protocol, TypedDict
 
-from typing_extensions import NotRequired
+from typing_extensions import NotRequired, TypeGuard
 
 from .serializable_data_class import SerializableDataClass
 from .types import Metadata
@@ -53,6 +54,19 @@ def __post_init__(self):
             )
 
 
+class ScoreLike(Protocol):
+    @property
+    def name(self) -> str: ...
+
+    @property
+    def score(self) -> float | None: ...
+
+    @property
+    def metadata(self) -> Metadata: ...
+
+    def as_dict(self) -> Mapping[str, Any]: ...
+
+
 class ClassificationItem(TypedDict):
     id: str
     label: NotRequired[str]
@@ -76,7 +90,7 @@ class Classification(SerializableDataClass):
     """Optional metadata attached to the classification result."""
 
     def as_dict(self):
-        result = {"id": self.id}
+        result: dict[str, Any] = {"id": self.id}
         if self.name is not None:
             result["name"] = self.name
         if self.label is not None:
@@ -102,7 +116,7 @@ def __post_init__(self):
         raise ValueError("classification label must be a string when provided")
 
 
-def is_score(obj):
+def is_score(obj: object) -> TypeGuard[ScoreLike]:
     return hasattr(obj, "name") and hasattr(obj, "score") and hasattr(obj, "metadata") and hasattr(obj, "as_dict")
 
 
@@ -151,6 +165,7 @@ def is_scorer(obj):
     "Classification",
     "ClassificationItem",
     "Score",
+    "ScoreLike",
     "Scorer",
     "is_classification",
     "is_score",
diff --git a/py/uv.lock b/py/uv.lock
index 16fd63be..0da4c9c2 100644
--- a/py/uv.lock
+++ b/py/uv.lock
@@ -471,7 +471,8 @@ version = "0.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "chevron" },
-    { name = "jsonschema", version = "4.26.0", source = { registry = "https://pypi.org/simple" } },
+    { name = "jsonschema", version = "4.23.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'group-10-braintrust-test-crewai' or (extra == 'group-10-braintrust-lint' and extra == 'group-10-braintrust-test-agentscope') or (extra == 'group-10-braintrust-lint' and extra == 'group-10-braintrust-test-agno') or (extra == 'group-10-braintrust-test-agentscope' and extra == 'group-10-braintrust-test-agno') or (extra == 'group-10-braintrust-test-agno' and extra == 'group-10-braintrust-test-langchain') or (extra == 'group-10-braintrust-test-agno' and extra == 'group-10-braintrust-test-litellm') or (extra == 'group-10-braintrust-test-agno' and extra == 'group-10-braintrust-test-openai-agents') or (extra == 'group-10-braintrust-test-agno' and extra == 'group-10-braintrust-test-strands') or (extra == 'group-10-braintrust-test-langchain' and extra == 'group-10-braintrust-test-litellm') or (extra == 'group-10-braintrust-test-langchain' and extra == 'group-10-braintrust-test-openai-agents') or (extra == 'group-10-braintrust-test-langchain' and extra == 'group-10-braintrust-test-strands') or (extra == 'group-10-braintrust-test-litellm' and extra == 'group-10-braintrust-test-openai-agents') or (extra == 'group-10-braintrust-test-litellm' and extra == 'group-10-braintrust-test-strands') or (extra == 'group-10-braintrust-test-openai-agents' and extra == 'group-10-braintrust-test-strands') or (extra == 'group-10-braintrust-lint' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-langchain') or (extra == 'group-10-braintrust-lint' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-litellm') or (extra == 'group-10-braintrust-lint' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-openai-agents') or (extra == 'group-10-braintrust-lint' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-pydantic-ai-logfire') or (extra == 'group-10-braintrust-lint' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-strands') or (extra == 'group-10-braintrust-test-agentscope' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-langchain') or (extra == 'group-10-braintrust-test-agentscope' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-litellm') or (extra == 'group-10-braintrust-test-agentscope' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-openai-agents') or (extra == 'group-10-braintrust-test-agentscope' and extra != 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-strands')" },
+    { name = "jsonschema", version = "4.26.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'group-10-braintrust-lint' or extra == 'group-10-braintrust-test-agentscope' or extra == 'group-10-braintrust-test-agno' or extra != 'group-10-braintrust-test-crewai' or (extra == 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-langchain') or (extra == 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-litellm') or (extra == 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-openai-agents') or (extra == 'group-10-braintrust-test-crewai' and extra == 'group-10-braintrust-test-strands')" },
     { name = "polyleven" },
     { name = "pyyaml" },
 ]
@@ -882,6 +883,7 @@ test-strands = [
     { name = "pytest-vcr" },
 ]
 test-types = [
+    { name = "autoevals" },
     { name = "mypy" },
     { name = "pyright" },
     { name = "pytest" },
@@ -1055,6 +1057,7 @@ test-strands = [
     { name = "pytest-vcr", specifier = "==1.0.2" },
 ]
 test-types = [
+    { name = "autoevals", specifier = "==0.2.0" },
     { name = "mypy", specifier = "==1.20.0" },
     { name = "pyright", specifier = "==1.1.408" },
     { name = "pytest", specifier = "==9.0.2" },
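What patch 1 enables, as a minimal sketch (not part of the series): ScoreLike is a Protocol, so any object that structurally exposes name, score, metadata, and as_dict satisfies it without subclassing braintrust's Score dataclass, which is exactly how autoevals result objects slot in. And because is_score now returns TypeGuard[ScoreLike], a True result narrows a plain object for the type checker. The class below is hypothetical:

    # Sketch assuming braintrust (with patch 1 applied) is importable.
    from collections.abc import Mapping
    from typing import Any

    from braintrust.score import is_score


    class HandRolledScore:
        """Hypothetical result object: no braintrust base class, just the right shape."""

        name = "exact_match"
        score: float | None = 1.0
        metadata: Any = {}  # stands in for braintrust's Metadata type

        def as_dict(self) -> Mapping[str, Any]:
            return {"name": self.name, "score": self.score}


    obj: object = HandRolledScore()
    if is_score(obj):
        # TypeGuard[ScoreLike] narrows obj here, so these attribute
        # accesses type-check without casts.
        print(obj.name, obj.score)
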
From 7fb2e5e49ba86b611e827500a3f9d816586e4284 Mon Sep 17 00:00:00 2001
From: John Huang
Date: Thu, 7 May 2026 10:04:48 -0700
Subject: [PATCH 2/5] type test

---
 .../type_tests/test_autoevals_scorers.py | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 py/src/braintrust/type_tests/test_autoevals_scorers.py

diff --git a/py/src/braintrust/type_tests/test_autoevals_scorers.py b/py/src/braintrust/type_tests/test_autoevals_scorers.py
new file mode 100644
index 00000000..51c04c72
--- /dev/null
+++ b/py/src/braintrust/type_tests/test_autoevals_scorers.py
@@ -0,0 +1,41 @@
+"""Type-check and runtime tests for autoevals scorers in Eval."""
+
+import pytest
+from autoevals import Levenshtein  # type: ignore[import-untyped]
+from braintrust.framework import EvalAsync, EvalCase, EvalScorer
+
+
+def accepts_autoevals_scorer(
+    scorer: EvalScorer[str, str, str],
+) -> EvalScorer[str, str, str]:
+    return scorer
+
+
+def autoevals_data():
+    return iter([EvalCase(input="query", expected="hello world")])
+
+
+async def autoevals_task(input: str) -> str:
+    return "hello world"
+
+
+autoevals_scores: list[EvalScorer[str, str, str]] = [
+    accepts_autoevals_scorer(Levenshtein()),
+    accepts_autoevals_scorer(Levenshtein),
+    accepts_autoevals_scorer(Levenshtein.partial(hehe="hoho")),
+]
+
+
+@pytest.mark.asyncio
+async def test_eval_accepts_autoevals_scorers():
+    result = await EvalAsync(
+        "test-autoevals-scorers",
+        data=autoevals_data,
+        task=autoevals_task,
+        scores=autoevals_scores,
+        no_send_logs=True,
+    )
+
+    score = result.results[0].scores["Levenshtein"]
+    assert score is not None
+    assert score > 0
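A note on the pattern above (not part of the series): accepts_autoevals_scorer is an identity function typed at the target alias, the usual trick for a static assignability assertion. If an argument does not satisfy EvalScorer[str, str, str], pyright/mypy flag the call site, while at runtime the value passes through unchanged. The sketch below applies the same trick to a plain function scorer; the (input, output, expected) call shape, and its acceptance as an EvalScorer, are assumptions not shown in this series:

    # Sketch under assumptions: braintrust is importable, and EvalScorer admits
    # plain callables returning a float (one branch of OneOrMoreScores, patch 1).
    from braintrust.framework import EvalScorer


    def assert_is_eval_scorer(  # hypothetical helper, mirrors accepts_autoevals_scorer
        scorer: EvalScorer[str, str, str],
    ) -> EvalScorer[str, str, str]:
        return scorer


    def exact_match(input: str, output: str, expected: str) -> float:
        # Assumed function-scorer signature; returns 1.0 on an exact match.
        return 1.0 if output == expected else 0.0


    checked = assert_is_eval_scorer(exact_match)  # checker error here = "not a scorer"
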
From 4421784462026d396f4aa59b5caf7258a223dc40 Mon Sep 17 00:00:00 2001
From: John Huang
Date: Thu, 7 May 2026 10:06:14 -0700
Subject: [PATCH 3/5] oops

---
 py/src/braintrust/framework.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py
index b1544010..41b5b011 100644
--- a/py/src/braintrust/framework.py
+++ b/py/src/braintrust/framework.py
@@ -216,7 +216,7 @@ class EvalScorerArgs(SerializableDataClass, Generic[Input, Output, Expected]):
     metadata: Metadata | None = None
 
 
-OneOrMoreScores = float | int | bool | None | ScoreLike | list[Score]
+OneOrMoreScores = float | int | bool | None | ScoreLike | list[ScoreLike]
 OneOrMoreClassifications = None | Classification | Mapping[str, Any] | list[Classification | Mapping[str, Any]]
 
 

From 5516ace05a520a9cc559d4c1fe378fda5c4e054e Mon Sep 17 00:00:00 2001
From: John Huang
Date: Thu, 7 May 2026 10:12:50 -0700
Subject: [PATCH 4/5] test

---
 .../type_tests/test_autoevals_scorers.py | 92 ++++++++++++++++++-
 1 file changed, 89 insertions(+), 3 deletions(-)

diff --git a/py/src/braintrust/type_tests/test_autoevals_scorers.py b/py/src/braintrust/type_tests/test_autoevals_scorers.py
index 51c04c72..6723a43c 100644
--- a/py/src/braintrust/type_tests/test_autoevals_scorers.py
+++ b/py/src/braintrust/type_tests/test_autoevals_scorers.py
@@ -2,7 +2,7 @@
 
 import pytest
 from autoevals import Levenshtein  # type: ignore[import-untyped]
-from braintrust.framework import EvalAsync, EvalCase, EvalScorer
+from braintrust.framework import Eval, EvalAsync, EvalCase, EvalScorer
 
 
 def accepts_autoevals_scorer(
@@ -22,12 +22,79 @@ async def autoevals_task(input: str) -> str:
 autoevals_scores: list[EvalScorer[str, str, str]] = [
     accepts_autoevals_scorer(Levenshtein()),
     accepts_autoevals_scorer(Levenshtein),
-    accepts_autoevals_scorer(Levenshtein.partial(hehe="hoho")),
+    accepts_autoevals_scorer(Levenshtein.partial(foo="bar")),
 ]
 
 
+autoevals_scores_untyped = [
+    Levenshtein(),
+    Levenshtein,
+    Levenshtein.partial(foo="bar"),
+]
+
+
+async def test_eval_accepts_autoevals_scorers_typed():
+    result = Eval(
+        "test-autoevals-scorers",
+        data=autoevals_data,
+        task=autoevals_task,
+        scores=autoevals_scores,
+        no_send_logs=True,
+    )
+
+    score = result.results[0].scores["Levenshtein"]
+    assert score is not None
+    assert score > 0
+
+
+async def test_eval_accepts_autoevals_scorers_untyped():
+    result = Eval(
+        "test-autoevals-scorers",
+        data=autoevals_data,
+        task=autoevals_task,
+        scores=autoevals_scores_untyped,
+        no_send_logs=True,
+    )
+
+    score = result.results[0].scores["Levenshtein"]
+    assert score is not None
+    assert score > 0
+
+
+async def test_eval_accepts_autoevals_scorers_inline():
+    result = Eval(
+        "test-autoevals-scorers",
+        data=autoevals_data,
+        task=autoevals_task,
+        scores=[
+            Levenshtein(),
+            Levenshtein,
+            Levenshtein.partial(foo="bar"),
+        ],
+        no_send_logs=True,
+    )
+
+    score = result.results[0].scores["Levenshtein"]
+    assert score is not None
+    assert score > 0
+
+
+@pytest.mark.asyncio
+async def test_eval_async_accepts_autoevals_scorers_typed():
+    result = await EvalAsync(
+        "test-autoevals-scorers",
+        data=autoevals_data,
+        task=autoevals_task,
+        scores=autoevals_scores,
+        no_send_logs=True,
+    )
+
+    score = result.results[0].scores["Levenshtein"]
+    assert score is not None
+    assert score > 0
+
+
 @pytest.mark.asyncio
-async def test_eval_accepts_autoevals_scorers():
+async def test_eval_async_accepts_autoevals_scorers_untyped():
     result = await EvalAsync(
         "test-autoevals-scorers",
         data=autoevals_data,
@@ -39,3 +106,22 @@ async def test_eval_accepts_autoevals_scorers():
     score = result.results[0].scores["Levenshtein"]
     assert score is not None
     assert score > 0
+
+
+@pytest.mark.asyncio
+async def test_eval_async_accepts_autoevals_scorers_inline():
+    result = await EvalAsync(
+        "test-autoevals-scorers",
+        data=autoevals_data,
+        task=autoevals_task,
+        scores=[
+            Levenshtein(),
+            Levenshtein,
+            Levenshtein.partial(foo="bar"),
+        ],
+        no_send_logs=True,
+    )
+
+    score = result.results[0].scores["Levenshtein"]
+    assert score is not None
+    assert score > 0
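Why the follow-up commit below is needed (a minimal illustration, not part of the series): the three Eval tests added above are declared async def but carry no @pytest.mark.asyncio, and with pytest-asyncio in its default strict mode such coroutines are collected, warned about ("coroutine ... was never awaited"), and skipped, so their assertions never run. Patch 5 therefore makes them synchronous and splits the task helper into sync and async variants:

    # Illustration of the pitfall (assumes pytest + pytest-asyncio in strict mode).
    import pytest


    @pytest.mark.asyncio
    async def test_awaited_properly():
        # The marker hands the coroutine to pytest-asyncio, so this body runs.
        assert True


    async def test_never_exercised():
        # No marker: the coroutine is never awaited; pytest warns and skips,
        # so even this failing assertion goes unnoticed.
        assert False
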
From 04680765c2e99f3a760b52f06978a448680e315e Mon Sep 17 00:00:00 2001
From: John Huang
Date: Fri, 8 May 2026 08:51:26 -0700
Subject: [PATCH 5/5] fix async

---
 .../type_tests/test_autoevals_scorers.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/py/src/braintrust/type_tests/test_autoevals_scorers.py b/py/src/braintrust/type_tests/test_autoevals_scorers.py
index 6723a43c..9898a4ff 100644
--- a/py/src/braintrust/type_tests/test_autoevals_scorers.py
+++ b/py/src/braintrust/type_tests/test_autoevals_scorers.py
@@ -15,7 +15,11 @@ def autoevals_data():
     return iter([EvalCase(input="query", expected="hello world")])
 
 
-async def autoevals_task(input: str) -> str:
+def autoevals_task(input: str) -> str:
+    return "hello world"
+
+
+async def autoevals_task_async(input: str) -> str:
     return "hello world"
 
 
@@ -32,7 +36,7 @@ async def autoevals_task(input: str) -> str:
 ]
 
 
-async def test_eval_accepts_autoevals_scorers_typed():
+def test_eval_accepts_autoevals_scorers_typed():
     result = Eval(
         "test-autoevals-scorers",
         data=autoevals_data,
@@ -46,7 +50,7 @@ async def test_eval_accepts_autoevals_scorers_typed():
     assert score > 0
 
 
-async def test_eval_accepts_autoevals_scorers_untyped():
+def test_eval_accepts_autoevals_scorers_untyped():
     result = Eval(
         "test-autoevals-scorers",
         data=autoevals_data,
@@ -60,7 +64,7 @@ async def test_eval_accepts_autoevals_scorers_untyped():
     assert score > 0
 
 
-async def test_eval_accepts_autoevals_scorers_inline():
+def test_eval_accepts_autoevals_scorers_inline():
     result = Eval(
         "test-autoevals-scorers",
         data=autoevals_data,
@@ -83,7 +87,7 @@ async def test_eval_async_accepts_autoevals_scorers_typed():
     result = await EvalAsync(
         "test-autoevals-scorers",
         data=autoevals_data,
-        task=autoevals_task,
+        task=autoevals_task_async,
         scores=autoevals_scores,
         no_send_logs=True,
     )
@@ -98,7 +102,7 @@ async def test_eval_async_accepts_autoevals_scorers_untyped():
     result = await EvalAsync(
         "test-autoevals-scorers",
         data=autoevals_data,
-        task=autoevals_task,
+        task=autoevals_task_async,
         scores=autoevals_scores,
         no_send_logs=True,
     )
@@ -113,7 +117,7 @@ async def test_eval_async_accepts_autoevals_scorers_inline():
     result = await EvalAsync(
         "test-autoevals-scorers",
         data=autoevals_data,
-        task=autoevals_task,
+        task=autoevals_task_async,
         scores=[
             Levenshtein(),
             Levenshtein,
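Taken together, the series lets autoevals scorers flow through Eval and EvalAsync with no casts: ScoreLike admits their result objects structurally, and OneOrMoreScores covers single values as well as list[ScoreLike]. An end-to-end sketch of the final behavior (hypothetical project name; assumes braintrust with this series applied and autoevals installed):

    from autoevals import Levenshtein  # type: ignore[import-untyped]
    from braintrust.framework import Eval, EvalCase


    def data():
        return iter([EvalCase(input="query", expected="hello world")])


    def task(input: str) -> str:
        return "hello world"


    result = Eval(
        "demo-autoevals",  # hypothetical project name
        data=data,
        task=task,
        scores=[Levenshtein],  # the class itself; an instance or .partial(...) also type-checks
        no_send_logs=True,
    )

    # Identical output and expected strings give a Levenshtein score of 1.0.
    print(result.results[0].scores["Levenshtein"])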