From e058e4ddc48840065ff2199d319bc6246acfbf24 Mon Sep 17 00:00:00 2001 From: Stephen Belanger Date: Wed, 22 Apr 2026 10:43:34 -0700 Subject: [PATCH 1/5] Add classifier support --- lib/braintrust/classifier.rb | 157 ++++++++++ lib/braintrust/eval.rb | 35 ++- lib/braintrust/eval/context.rb | 47 ++- lib/braintrust/eval/evaluator.rb | 21 +- lib/braintrust/eval/result.rb | 6 +- lib/braintrust/eval/runner.rb | 113 +++++++- .../server/services/list_service.rb | 5 + test/braintrust/classifier_test.rb | 226 +++++++++++++++ test/braintrust/eval/evaluator_test.rb | 5 +- test/braintrust/eval/runner_test.rb | 269 ++++++++++++++++++ test/braintrust/eval_test.rb | 48 ++++ test/support/braintrust_helper.rb | 6 +- 12 files changed, 901 insertions(+), 37 deletions(-) create mode 100644 lib/braintrust/classifier.rb create mode 100644 test/braintrust/classifier_test.rb diff --git a/lib/braintrust/classifier.rb b/lib/braintrust/classifier.rb new file mode 100644 index 00000000..089de302 --- /dev/null +++ b/lib/braintrust/classifier.rb @@ -0,0 +1,157 @@ +# frozen_string_literal: true + +require_relative "internal/callable" + +module Braintrust + # Classifier wraps a classification function that categorizes and labels eval outputs. + # + # Unlike scorers (which return numeric 0-1 values), classifiers return structured + # {Classification} items with an id and optional label and metadata. + # + # Use inline with a block (keyword args): + # classifier = Classifier.new("category") { |output:| {name: "category", id: "greeting", label: "Greeting"} } + # + # Or include in a class and define #call with keyword args: + # class CategoryClassifier + # include Braintrust::Classifier + # + # def call(output:) + # {name: "category", id: "greeting", label: "Greeting"} + # end + # end + # + # Classifiers may return a single Classification hash, an Array of them, or nil + # (meaning no classifications for this case). + module Classifier + DEFAULT_NAME = "classifier" + + # @param base [Class] the class including Classifier + def self.included(base) + base.include(Callable) + end + + # Create a block-based classifier. + # + # @param name [String, nil] optional name (defaults to "classifier") + # @param block [Proc] the classification implementation; declare only the keyword + # args you need. Extra kwargs are filtered out automatically. + # + # Supported kwargs: +input:+, +expected:+, +output:+, +metadata:+, +trace:+, +parameters:+ + # @return [Classifier::Block] + # @raise [ArgumentError] if the block has unsupported arity + def self.new(name = nil, &block) + Block.new(name: name || DEFAULT_NAME, &block) + end + + # Included into classes that +include Classifier+. Prepends KeywordFilter and + # ClassificationNormalizer so #call receives only declared kwargs and always returns + # Array. Also provides a default #name and #call_parameters. + module Callable + # Normalizes the raw return value of #call into Array. + # Nested inside Callable because it depends on #name which Callable provides. 
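+      # For example (values hypothetical), a classifier named "category" whose
+      # block returns {id: "greeting"} is normalized to
+      # [{name: "category", id: "greeting"}].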
+ module ClassificationNormalizer + # @return [Array] normalized classification hashes with :name, :id, and optional :label, :metadata keys + def call(**kwargs) + normalize_classification_result(super) + end + + private + + # @param result [Hash, Array, nil] raw return value from #call + # @return [Array] zero or more classification hashes with :name, :id keys + # @raise [ArgumentError] if any item is not a non-empty object + def normalize_classification_result(result) + case result + when nil then [] + when Array then result.map { |item| normalize_classification_item(item) } + when Hash then [normalize_classification_item(result)] + else + raise ArgumentError, "When returning structured classifier results, each classification must be a non-empty object. Got: #{result.inspect}" + end + end + + # Fills in missing :name from the classifier, validates :id. + # @param item [Hash] a classification hash + # @return [Hash] the item with :name defaulted and validated + # @raise [ArgumentError] if item is not a non-empty Hash + def normalize_classification_item(item) + unless item.is_a?(Hash) && !item.empty? + raise ArgumentError, "When returning structured classifier results, each classification must be a non-empty object. Got: #{item.inspect}" + end + + # :name defaults to the classifier's resolved name when missing, empty, or non-string + unless item[:name].is_a?(String) && !item[:name].empty? + item = item.merge(name: name) + end + + item + end + end + + # Infrastructure modules prepended onto every classifier class. + # Used both to set up the ancestor chain and to skip past them in + # #call_parameters so KeywordFilter sees the real call signature. + PREPENDED = [Internal::Callable::KeywordFilter, ClassificationNormalizer].freeze + + # @param base [Class] the class including Callable + def self.included(base) + PREPENDED.each { |mod| base.prepend(mod) } + end + + # Default name derived from the class name (e.g. CategoryClassifier -> "category_classifier"). + # @return [String] + def name + klass = self.class.name&.split("::")&.last + return Classifier::DEFAULT_NAME unless klass + klass.gsub(/([a-z])([A-Z])/, '\1_\2').downcase + end + + # Provides KeywordFilter with the actual call signature of the subclass. + # Walks past PREPENDED modules in the ancestor chain so that user-defined + # #call keyword params are correctly introspected. + # Block overrides this to point directly at @block.parameters. + # @return [Array] parameter list + def call_parameters + meth = method(:call) + meth = meth.super_method while meth.super_method && PREPENDED.include?(meth.owner) + meth.parameters + end + end + + # Block-based classifier. Stores a Proc and delegates #call to it. + # Includes Classifier so it satisfies +Classifier ===+ checks. + # Exposes #call_parameters so KeywordFilter can introspect the block's + # declared kwargs rather than Block#call's **kwargs signature. 
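+    # For example (a hedged sketch): a block declaring only |output:| can still
+    # be invoked as classifier.call(input: "x", output: "y"); the block receives
+    # only output: "y", and input: is filtered away.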
+ class Block + include Classifier + + # @return [String] + attr_reader :name + + # @param name [String] classifier name + # @param block [Proc] classification implementation; must use keyword args or zero-arity + # @raise [ArgumentError] if the block uses positional params + def initialize(name: DEFAULT_NAME, &block) + @name = name + params = block.parameters + unless Internal::Callable::KeywordFilter.has_any_keywords?(params) || block.arity == 0 + raise ArgumentError, "Classifier block must use keyword args (got arity #{block.arity})" + end + @block = block + end + + # @param kwargs [Hash] keyword arguments (filtered by KeywordFilter) + # @return [Array] normalized classification results + def call(**kwargs) + @block.call(**kwargs) + end + + # Exposes the block's parameter list so KeywordFilter can filter + # kwargs to match the block's declared keywords. + # @return [Array] parameter list from Proc#parameters + def call_parameters + @block.parameters + end + end + end +end diff --git a/lib/braintrust/eval.rb b/lib/braintrust/eval.rb index d661eedc..ac27b48b 100644 --- a/lib/braintrust/eval.rb +++ b/lib/braintrust/eval.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true +require_relative "classifier" require_relative "scorer" require_relative "task" require_relative "functions" @@ -160,7 +161,10 @@ def scorer(name, callable = nil, &block) # - String: dataset name (fetches from same project) # - Hash: {name:, id:, project:, version:, limit:} # @param task [#call] The task to evaluate (must be callable) - # @param scorers [Array] The scorers to use (String names, Scorer objects, or callables) + # @param scorers [Array, nil] The scorers to use (String names, Scorer objects, or callables). + # At least one of scorers or classifiers must be provided. + # @param classifiers [Array, nil] The classifiers to use. + # At least one of scorers or classifiers must be provided. # @param on_progress [#call, nil] Optional callback fired after each test case. # Receives a Hash: {"data" => output, "scores" => {name => value}} on success, # or {"error" => message} on failure. 
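A minimal end-to-end sketch of the extended signature (the project name is
hypothetical, and Braintrust.init/API configuration is assumed to happen
elsewhere):

    result = Braintrust::Eval.run(
      project: "my-project",
      cases: [{input: "hi", expected: "HI"}],
      task: ->(input:) { input.upcase },
      scorers: [Braintrust::Scorer.new("exact") { |expected:, output:| (output == expected) ? 1.0 : 0.0 }],
      classifiers: [Braintrust::Classifier.new("length") { |output:| {id: (output.length > 2) ? "long" : "short"} }]
    )
    result.scores          # => {"exact" => [1.0]}
    result.classifications # => {"length" => [{id: "short"}]}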
@@ -177,13 +181,16 @@ def scorer(name, callable = nil, &block) # @param parent [Hash, nil] Parent span context ({object_type:, object_id:, generation:}) # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument # @return [Result] - def run(task:, scorers:, project: nil, experiment: nil, - cases: nil, dataset: nil, on_progress: nil, + def run(task:, scorers: nil, classifiers: nil, project: nil, + experiment: nil, cases: nil, dataset: nil, on_progress: nil, parallelism: 1, tags: nil, metadata: nil, update: false, quiet: false, state: nil, tracer_provider: nil, project_id: nil, parent: nil, parameters: nil) # Validate required parameters - validate_params!(task: task, scorers: scorers, cases: cases, dataset: dataset) + validate_params!(task: task, scorers: scorers, + classifiers: classifiers, cases: cases, dataset: dataset) + scorers ||= [] + classifiers ||= [] experiment_id = nil project_name = project @@ -216,6 +223,7 @@ def run(task:, scorers:, project: nil, experiment: nil, context = Context.build( task: task, scorers: scorers, + classifiers: classifiers, cases: cases, experiment_id: experiment_id, experiment_name: experiment, @@ -245,9 +253,19 @@ def print_result(result) # Validate required parameters # @raise [ArgumentError] if validation fails - def validate_params!(task:, scorers:, cases:, dataset:) + def validate_params!(task:, scorers:, classifiers:, cases:, dataset:) raise ArgumentError, "task is required" unless task - raise ArgumentError, "scorers is required" unless scorers + + # Validate task is callable before anything else + unless task.respond_to?(:call) + raise ArgumentError, "task must be callable (respond to :call)" + end + + has_scorers = scorers && !scorers.empty? + has_classifiers = classifiers && !classifiers.empty? + unless has_scorers || has_classifiers + raise ArgumentError, "at least one of scorers or classifiers is required" + end # Validate cases and dataset are mutually exclusive if cases && dataset @@ -258,11 +276,6 @@ def validate_params!(task:, scorers:, cases:, dataset:) unless cases || dataset raise ArgumentError, "must specify either 'cases' or 'dataset'" end - - # Validate task is callable - unless task.respond_to?(:call) - raise ArgumentError, "task must be callable (respond to :call)" - end end # Resolve project by name or ID. Creates if needed. diff --git a/lib/braintrust/eval/context.rb b/lib/braintrust/eval/context.rb index ebcca050..02717267 100644 --- a/lib/braintrust/eval/context.rb +++ b/lib/braintrust/eval/context.rb @@ -1,18 +1,20 @@ # frozen_string_literal: true require_relative "cases" +require_relative "../classifier" module Braintrust module Eval # Holds all normalized, ready-to-execute eval components. # Use Context.build to construct from raw user inputs. 
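+    # For example (task/classifier values hypothetical):
+    #   Context.build(task: ->(input:) { input }, scorers: [], classifiers: [classifier], cases: [{input: "hi"}])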
class Context - attr_reader :task, :scorers, :cases, :experiment_id, :experiment_name, - :project_id, :project_name, :state, :tracer_provider, + attr_reader :task, :scorers, :classifiers, :cases, :experiment_id, + :experiment_name, :project_id, :project_name, :state, :tracer_provider, :on_progress, :parent_span_attr, :generation, :parameters # @param task [Task] Normalized task wrapper # @param scorers [Array] Normalized scorer wrappers + # @param classifiers [Array] Normalized classifier wrappers # @param cases [Cases] Normalized eval cases # @param experiment_id [String, nil] Experiment ID for logging and trace linkage # @param experiment_name [String, nil] Experiment name, included in span attributes @@ -24,11 +26,13 @@ class Context # @param parent_span_attr [String, nil] Formatted parent span identifier ("type:id"), linking spans to a parent context # @param generation [Integer, nil] Generation number from the parent span context, used to link spans in a trace hierarchy # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument - def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, - project_id: nil, project_name: nil, state: nil, tracer_provider: nil, - on_progress: nil, parent_span_attr: nil, generation: nil, parameters: nil) + def initialize(task:, scorers:, cases:, classifiers: [], + experiment_id: nil, experiment_name: nil, project_id: nil, + project_name: nil, state: nil, tracer_provider: nil, on_progress: nil, + parent_span_attr: nil, generation: nil, parameters: nil) @task = task @scorers = scorers + @classifiers = classifiers @cases = cases @experiment_id = experiment_id @experiment_name = experiment_name @@ -46,6 +50,7 @@ def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil # Delegates to Factory for normalization. 
# @param task [Task, Proc, #call] Task to evaluate; wrapped into a {Task} if needed # @param scorers [Array] Scorers; each is normalized into a {Scorer} + # @param classifiers [Array] Classifiers; each is normalized into a {Classifier} # @param cases [Cases, Array, Enumerable] Eval cases; wrapped into {Cases} if needed # @param experiment_id [String, nil] Experiment ID for logging # @param experiment_name [String, nil] Experiment name, included in span attributes @@ -57,14 +62,15 @@ def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil # @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument # @return [Context] - def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, - project_id: nil, project_name: nil, state: nil, tracer_provider: nil, - on_progress: nil, parent: nil, parameters: nil) + def self.build(task:, scorers:, cases:, classifiers: [], + experiment_id: nil, experiment_name: nil, project_id: nil, + project_name: nil, state: nil, tracer_provider: nil, on_progress: nil, + parent: nil, parameters: nil) Factory.new( state: state, tracer_provider: tracer_provider, project_id: project_id, project_name: project_name ).build( - task: task, scorers: scorers, cases: cases, + task: task, scorers: scorers, classifiers: classifiers, cases: cases, experiment_id: experiment_id, experiment_name: experiment_name, on_progress: on_progress, parent: parent, parameters: parameters ) @@ -86,17 +92,19 @@ def initialize(state: nil, tracer_provider: nil, project_id: nil, project_name: # Normalize raw inputs and construct a {Context}. # @param task [Task, Proc, #call] Raw task # @param scorers [Array] Raw scorers + # @param classifiers [Array] Raw classifiers # @param cases [Cases, Array, Enumerable] Raw eval cases # @param experiment_id [String, nil] # @param experiment_name [String, nil] # @param on_progress [Proc, nil] # @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation # @return [Context] - def build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, - on_progress: nil, parent: nil, parameters: nil) + def build(task:, scorers:, cases:, classifiers: [], experiment_id: nil, + experiment_name: nil, on_progress: nil, parent: nil, parameters: nil) Context.new( task: normalize_task(task), scorers: normalize_scorers(scorers), + classifiers: normalize_classifiers(classifiers), cases: normalize_cases(cases), experiment_id: experiment_id, experiment_name: experiment_name, @@ -188,6 +196,23 @@ def normalize_scorers(raw) end end end + + # @param raw [Array] + # @return [Array] + def normalize_classifiers(raw) + raw.map do |classifier| + case classifier + when Braintrust::Classifier + classifier + when Proc + # Pass Proc/Lambda directly to preserve keyword arg info + Braintrust::Classifier.new(&classifier) + else + name = classifier.respond_to?(:name) ? 
classifier.name : nil + Braintrust::Classifier.new(name, &classifier.method(:call)) + end + end + end end end end diff --git a/lib/braintrust/eval/evaluator.rb b/lib/braintrust/eval/evaluator.rb index a5f135bc..6798f31a 100644 --- a/lib/braintrust/eval/evaluator.rb +++ b/lib/braintrust/eval/evaluator.rb @@ -40,11 +40,12 @@ module Eval # } # ) class Evaluator - attr_accessor :task, :scorers, :parameters + attr_accessor :task, :scorers, :classifiers, :parameters - def initialize(task: nil, scorers: [], parameters: {}) + def initialize(task: nil, scorers: [], classifiers: [], parameters: {}) @task = task @scorers = scorers + @classifiers = classifiers @parameters = parameters end @@ -68,6 +69,7 @@ def validate! # @param project_id [String, nil] Project UUID (skips project creation) # @param dataset [String, Hash, Dataset, Dataset::ID, nil] Dataset to fetch # @param scorers [Array, nil] Additional scorers (merged with evaluator's own) + # @param classifiers [Array, nil] Additional classifiers (merged with evaluator's own) # @param parent [Hash, nil] Parent span context # @param state [State, nil] Braintrust state # @param update [Boolean] If true, allow reusing existing experiment (default: false) @@ -75,16 +77,19 @@ def validate! # @return [Result] def run(cases, on_progress: nil, quiet: false, project: nil, experiment: nil, project_id: nil, - dataset: nil, scorers: nil, parent: nil, + dataset: nil, scorers: nil, classifiers: nil, parent: nil, state: nil, update: false, tracer_provider: nil, parameters: nil) all_scorers = scorers ? self.scorers + scorers : self.scorers + all_classifiers = classifiers ? + self.classifiers + classifiers : + self.classifiers Braintrust::Eval.run( - task: task, scorers: all_scorers, cases: cases, dataset: dataset, - project: project, experiment: experiment, project_id: project_id, - parent: parent, on_progress: on_progress, quiet: quiet, - state: state, update: update, tracer_provider: tracer_provider, - parameters: parameters + task: task, scorers: all_scorers, classifiers: all_classifiers, + cases: cases, dataset: dataset, project: project, + experiment: experiment, project_id: project_id, parent: parent, + on_progress: on_progress, quiet: quiet, state: state, update: update, + tracer_provider: tracer_provider, parameters: parameters ) end end diff --git a/lib/braintrust/eval/result.rb b/lib/braintrust/eval/result.rb index c18af302..7af7132f 100644 --- a/lib/braintrust/eval/result.rb +++ b/lib/braintrust/eval/result.rb @@ -9,7 +9,7 @@ module Eval # Contains experiment metadata, errors, timing information, and raw score data class Result attr_reader :experiment_id, :experiment_name, :project_id, :project_name, - :permalink, :errors, :duration, :scores + :permalink, :errors, :duration, :scores, :classifications # Create a new result # @param experiment_id [String] The experiment ID @@ -20,8 +20,9 @@ class Result # @param errors [Array] List of errors that occurred # @param duration [Float] Duration in seconds # @param scores [Hash, nil] Raw score data { scorer_name => Array } + # @param classifications [Hash, nil] Classification results { name => Array }, nil when no classifiers ran def initialize(experiment_id:, experiment_name:, project_id:, project_name:, - permalink:, errors:, duration:, scores: nil) + permalink:, errors:, duration:, scores: nil, classifications: nil) @experiment_id = experiment_id @experiment_name = experiment_name @project_id = project_id @@ -30,6 +31,7 @@ def initialize(experiment_id:, experiment_name:, project_id:, project_name:, @errors 
= errors @duration = duration @scores = scores + @classifications = classifications end # Check if the evaluation was successful (no errors) diff --git a/lib/braintrust/eval/runner.rb b/lib/braintrust/eval/runner.rb index f461e041..0cd249d8 100644 --- a/lib/braintrust/eval/runner.rb +++ b/lib/braintrust/eval/runner.rb @@ -27,8 +27,9 @@ def initialize(eval_context) @eval_context = eval_context @tracer = eval_context.tracer_provider.tracer("braintrust-eval") - # Mutex for thread-safe score collection + # Mutexes for thread-safe result collection @score_mutex = Mutex.new + @classification_mutex = Mutex.new end # Run evaluation and return Result @@ -39,6 +40,7 @@ def run(parallelism: 1) eval_cases = eval_context.cases errors = Queue.new @scores = {} # Reset for each run: { scorer_name => Array } + @classifications = {} # Reset for each run: { classifier_name => Array } if parallelism && parallelism > 1 Internal::ThreadPool.each(eval_cases, parallelism: parallelism) do |eval_case| @@ -69,7 +71,8 @@ def run(parallelism: 1) permalink: permalink, errors: error_array, duration: duration, - scores: @scores + scores: @scores, + classifications: @classifications.empty? ? nil : @classifications ) end @@ -119,6 +122,14 @@ def run_eval_case(kase, errors) errors << "Scorers failed for input '#{kase.input}': #{e.message}" end + # Run classifiers (independent of scorers; errors do not abort eval) + classifier_errors = run_classifiers(kase, eval_span) + unless classifier_errors.empty? + existing_metadata = kase.metadata || {} + classifier_errors_metadata = existing_metadata.merge(classifier_errors: classifier_errors) + set_json_attr(eval_span, "braintrust.metadata", classifier_errors_metadata) + end + # Set output after task completes set_json_attr(eval_span, "braintrust.output_json", {output: kase.output}) @@ -318,6 +329,104 @@ def collect_scores(score_results) score_results.each { |s| (@scores[s[:name]] ||= []) << s[:score] } end end + + # Run all classifiers for a case. Classifier errors are non-fatal and stored in metadata. + # @param kase [CaseContext] The per-case context (output must be populated) + # @param eval_span [OpenTelemetry::Trace::Span] The eval span for this case + # @return [Hash] classifier_errors map (name -> error message), empty if no errors + def run_classifiers(kase, eval_span) + return {} if eval_context.classifiers.empty? + + classifier_kwargs = { + input: kase.input, + expected: kase.expected, + output: kase.output, + metadata: kase.metadata || {}, + trace: kase.trace, + parameters: eval_context.parameters || {} + } + classifier_input = { + input: kase.input, + expected: kase.expected, + output: kase.output, + metadata: kase.metadata || {}, + parameters: eval_context.parameters || {} + } + + case_classifications = {} + classifier_errors = {} + + eval_context.classifiers.each_with_index do |classifier, index| + classifier_name = classifier.name || "classifier_#{index}" + begin + results = run_classifier(classifier, classifier_kwargs, classifier_input) + results.each do |item| + item_name = item[:name] + classification_item = item.except(:name) + (case_classifications[item_name] ||= []) << classification_item + end + collect_classifications(results) + rescue => e + Braintrust::Log.warn("[Classifier] #{classifier_name} failed: #{e.message}") + classifier_errors[classifier_name] = e.message + end + end + + unless case_classifications.empty? 
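+          # e.g. {"category" => [{id: "greeting", label: "Greeting"}]} (hypothetical values)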
+ set_json_attr(eval_span, "braintrust.classifications", case_classifications) + end + + classifier_errors + end + + # Run a single classifier inside its own span. + # @param classifier [Classifier] The classifier to run + # @param classifier_kwargs [Hash] Keyword arguments for the classifier + # @param classifier_input [Hash] Input to log on the span + # @return [Array] Normalized classification results from the classifier + def run_classifier(classifier, classifier_kwargs, classifier_input) + tracer.in_span(classifier.name) do |classifier_span| + classifier_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr + set_json_attr(classifier_span, "braintrust.span_attributes", build_classifier_span_attributes(classifier.name)) + set_json_attr(classifier_span, "braintrust.input_json", classifier_input) + + classification_results = classifier.call(**classifier_kwargs) + + # Build output dict keyed by name -> array of items (for span logging) + output_by_name = {} + classification_results.each do |item| + (output_by_name[item[:name]] ||= []) << item.except(:name) + end + + set_json_attr(classifier_span, "braintrust.output_json", output_by_name) + + classification_results + rescue => e + record_span_error(classifier_span, e, "ClassifierError") + raise + end + end + + # Build span_attributes for a classifier span. + # @param classifier_name [String] The classifier name + # @return [Hash] + def build_classifier_span_attributes(classifier_name) + attrs = {type: "classifier", name: classifier_name, purpose: "scorer"} + attrs[:generation] = eval_context.generation if eval_context.generation + attrs + end + + # Collect classification results into the global accumulator (thread-safe). + # Converts Classification to ClassificationItem by dropping :name. + # @param classification_results [Array] Classification results from a classifier + def collect_classifications(classification_results) + @classification_mutex.synchronize do + classification_results.each do |item| + item_name = item[:name] + (@classifications[item_name] ||= []) << item.except(:name) + end + end + end end end end diff --git a/lib/braintrust/server/services/list_service.rb b/lib/braintrust/server/services/list_service.rb index 06bd7add..8c29c6d3 100644 --- a/lib/braintrust/server/services/list_service.rb +++ b/lib/braintrust/server/services/list_service.rb @@ -20,6 +20,11 @@ def call {"name" => scorer_name} end entry = {"scores" => scores} + classifiers = (evaluator.classifiers || []).each_with_index.map do |classifier, i| + classifier_name = classifier.respond_to?(:name) ? classifier.name : "classifier_#{i}" + {"name" => classifier_name} + end + entry["classifiers"] = classifiers unless classifiers.empty? 
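+          # Illustrative entry shape with both kinds present (names hypothetical):
+          #   {"scores" => [{"name" => "exact"}], "classifiers" => [{"name" => "category"}]}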
params = serialize_parameters(evaluator.parameters) entry["parameters"] = params if params result[name] = entry diff --git a/test/braintrust/classifier_test.rb b/test/braintrust/classifier_test.rb new file mode 100644 index 00000000..8dfe4501 --- /dev/null +++ b/test/braintrust/classifier_test.rb @@ -0,0 +1,226 @@ +# frozen_string_literal: true + +require "test_helper" +require "braintrust/classifier" + +class Braintrust::ClassifierTest < Minitest::Test + # ============================================ + # Classifier.new with block (inline classifiers) + # ============================================ + + def test_classifier_with_kwargs_block + classifier = Braintrust::Classifier.new("category") do |output:, **| + {name: "category", id: "greeting", label: "Greeting"} + end + + assert_equal "category", classifier.name + result = classifier.call(input: "hello", expected: nil, output: "hello") + assert_equal [{name: "category", id: "greeting", label: "Greeting"}], result + end + + def test_classifier_with_subset_kwargs_filters_extra_keys + classifier = Braintrust::Classifier.new("category") do |output:| + {name: "category", id: "word"} + end + + result = classifier.call(input: "x", expected: nil, output: "hello", metadata: {}, tags: ["t"]) + assert_equal [{name: "category", id: "word"}], result + end + + def test_classifier_returns_nil_produces_empty_array + classifier = Braintrust::Classifier.new("maybe") { |**| nil } + assert_equal [], classifier.call(output: "hello") + end + + def test_classifier_returns_array_of_classifications + classifier = Braintrust::Classifier.new("sentiment") do |**| + [ + {name: "sentiment", id: "positive", label: "Positive"}, + {name: "sentiment", id: "enthusiastic", label: "Enthusiastic"} + ] + end + + result = classifier.call(output: "great!") + assert_equal 2, result.length + assert_equal({name: "sentiment", id: "positive", label: "Positive"}, result[0]) + assert_equal({name: "sentiment", id: "enthusiastic", label: "Enthusiastic"}, result[1]) + end + + def test_classifier_with_metadata + classifier = Braintrust::Classifier.new("category") do |**| + {name: "category", id: "greeting", label: "Greeting", metadata: {source: "unit-test"}} + end + + result = classifier.call(output: "hello") + assert_equal [{name: "category", id: "greeting", label: "Greeting", metadata: {source: "unit-test"}}], result + end + + # ============================================ + # Name defaulting + # ============================================ + + def test_name_defaults_to_classifier_function_name_when_missing + classifier = Braintrust::Classifier.new("my_classifier") { |**| + {id: "foo"} # no :name key + } + + result = classifier.call(output: "x") + assert_equal "my_classifier", result[0][:name] + end + + def test_name_defaults_to_classifier_function_name_when_empty_string + classifier = Braintrust::Classifier.new("my_classifier") { |**| + {name: "", id: "foo"} + } + + result = classifier.call(output: "x") + assert_equal "my_classifier", result[0][:name] + end + + def test_name_defaults_to_classifier_function_name_when_not_a_string + classifier = Braintrust::Classifier.new("my_classifier") { |**| + {name: 42, id: "foo"} + } + + result = classifier.call(output: "x") + assert_equal "my_classifier", result[0][:name] + end + + def test_explicit_name_in_result_takes_precedence + classifier = Braintrust::Classifier.new("my_classifier") { |**| + {name: "override_name", id: "foo"} + } + + result = classifier.call(output: "x") + assert_equal "override_name", result[0][:name] + end + + # 
============================================ + # Validation + # ============================================ + + def test_classifier_non_empty_object_validation_nil_item + classifier = Braintrust::Classifier.new("bad") { |**| [nil] } + + error = assert_raises(ArgumentError) do + classifier.call(output: "x") + end + assert_match(/each classification must be a non-empty object/, error.message) + assert_match(/nil/, error.message) + end + + def test_classifier_non_empty_object_validation_empty_hash + classifier = Braintrust::Classifier.new("bad") { |**| {} } + + error = assert_raises(ArgumentError) do + classifier.call(output: "x") + end + assert_match(/each classification must be a non-empty object/, error.message) + end + + def test_classifier_non_empty_object_validation_string_item + classifier = Braintrust::Classifier.new("bad") { |**| ["not-a-hash"] } + + error = assert_raises(ArgumentError) do + classifier.call(output: "x") + end + assert_match(/each classification must be a non-empty object/, error.message) + end + + def test_classifier_non_empty_object_validation_non_hash_scalar + classifier = Braintrust::Classifier.new("bad") { |**| 42 } + + error = assert_raises(ArgumentError) do + classifier.call(output: "x") + end + assert_match(/each classification must be a non-empty object/, error.message) + end + + def test_classifier_positional_params_raises + error = assert_raises(ArgumentError) do + Braintrust::Classifier.new("bad") { |a, b| a } + end + + assert_match(/classifier block must use keyword args/i, error.message) + end + + # ============================================ + # Name detection + # ============================================ + + def test_classifier_name_defaults_to_classifier_for_base_class + classifier = Braintrust::Classifier.new { |**| {id: "x"} } + assert_equal "classifier", classifier.name + end + + def test_classifier_explicit_name_takes_precedence + classifier = Braintrust::Classifier.new("my_name") { |**| {id: "x"} } + assert_equal "my_name", classifier.name + end + + # ============================================ + # Subclass pattern + # ============================================ + + def test_subclass_with_call_override + klass = Class.new do + include Braintrust::Classifier + + def call(output:) + {name: "category", id: output.empty? ? 
"empty" : "nonempty"} + end + end + + classifier = klass.new + assert_kind_of Braintrust::Classifier, classifier + + result = classifier.call(input: "x", expected: nil, output: "hello") + assert_equal [{name: "category", id: "nonempty"}], result + + result2 = classifier.call(input: "x", expected: nil, output: "") + assert_equal [{name: "category", id: "empty"}], result2 + end + + def test_subclass_with_name_override + klass = Class.new do + include Braintrust::Classifier + + def name + "custom_classifier" + end + + def call(**) + {id: "foo"} + end + end + + classifier = klass.new + assert_equal "custom_classifier", classifier.name + end + + def test_subclass_name_derived_from_class_name + klass = Class.new do + include Braintrust::Classifier + + def call(**) + {id: "foo"} + end + end + + Braintrust.stub_const(:FuzzyMatchTestClassifier, klass) do + classifier = klass.new + assert_equal "fuzzy_match_test_classifier", classifier.name + end + end + + def test_subclass_without_call_raises_on_call + klass = Class.new do + include Braintrust::Classifier + end + classifier = klass.new + + assert_raises(NoMethodError) do + classifier.call(output: "x") + end + end +end diff --git a/test/braintrust/eval/evaluator_test.rb b/test/braintrust/eval/evaluator_test.rb index e6268363..a590039b 100644 --- a/test/braintrust/eval/evaluator_test.rb +++ b/test/braintrust/eval/evaluator_test.rb @@ -77,7 +77,10 @@ def test_run_delegates_to_eval_run end def test_run_passes_on_progress - evaluator = Braintrust::Eval::Evaluator.new(task: ->(input:) { input }) + evaluator = Braintrust::Eval::Evaluator.new( + task: ->(input:) { input }, + scorers: [Braintrust::Scorer.new("noop") { 1.0 }] + ) progress_events = [] cases = [{input: "a"}, {input: "b"}] diff --git a/test/braintrust/eval/runner_test.rb b/test/braintrust/eval/runner_test.rb index bd4b525a..28eb95a4 100644 --- a/test/braintrust/eval/runner_test.rb +++ b/test/braintrust/eval/runner_test.rb @@ -2065,3 +2065,272 @@ def test_runner_parameters_with_parallelism assert(params.all? { |p| p == {"model" => "gpt-4"} }) end end + +class Braintrust::Eval::RunnerClassifierTest < Minitest::Test + def test_runner_with_classifiers_only + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("category") { |output:| {name: "category", id: "greeting", label: "Greeting"} } + ], + cases: [{input: "hello", expected: nil}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + assert_equal({}, result.scores) + assert_equal({"category" => [{id: "greeting", label: "Greeting"}]}, result.classifications) + end + + def test_runner_with_scorers_and_classifiers + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [Braintrust::Scorer.new("exact") { |expected:, output:| (output == expected) ? 1.0 : 0.0 }], + classifiers: [ + Braintrust::Classifier.new("category") { |**| {name: "category", id: "text"} } + ], + cases: [{input: "hello", expected: "HELLO"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? 
+ assert_equal [1.0], result.scores["exact"] + assert_equal({"category" => [{id: "text"}]}, result.classifications) + end + + def test_runner_classifier_nil_return_produces_no_classifications + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("maybe") { |**| nil } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + assert_nil result.classifications + end + + def test_runner_classifier_error_does_not_abort_eval + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [Braintrust::Scorer.new("always_one") { 1.0 }], + classifiers: [ + Braintrust::Classifier.new("broken") { |**| raise "classifier boom" } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + # Eval succeeds — classifier errors don't add to errors queue + assert result.success? + assert_equal [1.0], result.scores["always_one"] + assert_nil result.classifications + end + + def test_runner_classifier_error_does_not_affect_other_classifiers + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("broken") { |**| raise "boom" }, + Braintrust::Classifier.new("working") { |**| {name: "working", id: "ok"} } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? 
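+    # Only the failing classifier is dropped; the working one still reports: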
+ assert_equal({"working" => [{id: "ok"}]}, result.classifications) + end + + def test_runner_classifier_error_logged_to_span_metadata + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("broken") { |**| raise "classifier boom" } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + Braintrust::Eval::Runner.new(context).run + spans = rig.drain + + eval_span = spans.find { |s| s.name == "eval" } + refute_nil eval_span + metadata = JSON.parse(eval_span.attributes["braintrust.metadata"] || "{}") + assert_equal "classifier boom", metadata.dig("classifier_errors", "broken") + end + + def test_runner_classifier_span_attributes + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("my_classifier") { |**| {name: "my_classifier", id: "foo"} } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + Braintrust::Eval::Runner.new(context).run + spans = rig.drain + + classifier_span = spans.find { |s| s.name == "my_classifier" } + refute_nil classifier_span + span_attrs = JSON.parse(classifier_span.attributes["braintrust.span_attributes"]) + assert_equal "classifier", span_attrs["type"] + assert_equal "scorer", span_attrs["purpose"] + assert_equal "my_classifier", span_attrs["name"] + end + + def test_runner_classifier_multi_label_result + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("sentiment") do |**| + [ + {name: "sentiment", id: "positive", label: "Positive"}, + {name: "sentiment", id: "enthusiastic", label: "Enthusiastic"} + ] + end + ], + cases: [{input: "great!"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + sentiment = result.classifications["sentiment"] + assert_equal 2, sentiment.length + assert_equal({id: "positive", label: "Positive"}, sentiment[0]) + assert_equal({id: "enthusiastic", label: "Enthusiastic"}, sentiment[1]) + end + + def test_runner_classifier_name_defaults_to_function_name + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("my_classifier") { |**| {id: "foo"} } # no :name in result + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? 
+ assert result.classifications.key?("my_classifier") + end + + def test_runner_classifications_logged_to_eval_span + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("category") { |**| {name: "category", id: "greeting"} } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + Braintrust::Eval::Runner.new(context).run + spans = rig.drain + + eval_span = spans.find { |s| s.name == "eval" } + refute_nil eval_span + raw = eval_span.attributes["braintrust.classifications"] + refute_nil raw + classifications = JSON.parse(raw) + assert_equal [{"id" => "greeting"}], classifications["category"] + end + + def test_runner_classifications_nil_when_no_classifiers + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [Braintrust::Scorer.new("exact") { 1.0 }], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + assert_nil result.classifications + end + + def test_runner_multiple_cases_accumulate_classifications + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("category") { |input:| {name: "category", id: (input.length > 3) ? "long" : "short"} } + ], + cases: [{input: "hi"}, {input: "hello"}, {input: "ok"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + category = result.classifications["category"] + assert_equal 3, category.length + end +end diff --git a/test/braintrust/eval_test.rb b/test/braintrust/eval_test.rb index bef2a2cf..6a11d008 100644 --- a/test/braintrust/eval_test.rb +++ b/test/braintrust/eval_test.rb @@ -1384,4 +1384,52 @@ def test_eval_run_with_parameters end end end + + # ============================================ + # Classifier validation tests + # ============================================ + + def test_eval_run_requires_at_least_scorers_or_classifiers + error = assert_raises(ArgumentError) do + Braintrust::Eval.run( + cases: [{input: "hello"}], + task: ->(input:) { input } + ) + end + assert_match(/at least one of scorers or classifiers is required/i, error.message) + end + + def test_eval_run_requires_at_least_scorers_or_classifiers_when_empty_arrays + error = assert_raises(ArgumentError) do + Braintrust::Eval.run( + cases: [{input: "hello"}], + task: ->(input:) { input }, + scorers: [], + classifiers: [] + ) + end + assert_match(/at least one of scorers or classifiers is required/i, error.message) + end + + def test_eval_run_with_classifiers_only_no_scorers + rig = setup_otel_test_rig + + result = run_test_eval( + experiment_id: "exp-123", + experiment_name: "classifier-only", + project_id: "proj-456", + project_name: "test-project", + cases: [{input: "hello"}], + task: ->(input:) { input }, + classifiers: [ + Braintrust::Classifier.new("category") { |**| {name: "category", id: "greeting"} } + ], + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + assert result.success? 
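+    # No scorers were configured, so scores stays empty while classifications populate: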
+ assert_equal({}, result.scores) + assert_equal({"category" => [{id: "greeting"}]}, result.classifications) + end end diff --git a/test/support/braintrust_helper.rb b/test/support/braintrust_helper.rb index 49d57eb1..8f355ee1 100644 --- a/test/support/braintrust_helper.rb +++ b/test/support/braintrust_helper.rb @@ -75,11 +75,13 @@ def get_integration_test_api(**options) # Helper to run eval internally without API calls for testing # @param state [State] Braintrust state - def run_test_eval(experiment_id:, experiment_name:, project_id:, project_name:, - cases:, task:, scorers:, state:, parallelism: 1, tracer_provider: nil) + def run_test_eval(experiment_id:, experiment_name:, project_id:, + project_name:, cases:, task:, state:, scorers: [], classifiers: [], + parallelism: 1, tracer_provider: nil) context = Braintrust::Eval::Context.build( task: task, scorers: scorers, + classifiers: classifiers, cases: cases, experiment_id: experiment_id, experiment_name: experiment_name, From 4cd8c1047713c71a35fa282b9652f5f271fbd67b Mon Sep 17 00:00:00 2001 From: Stephen Belanger Date: Wed, 22 Apr 2026 12:52:19 -0700 Subject: [PATCH 2/5] Add classifier example --- examples/eval/classifiers.rb | 144 +++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 examples/eval/classifiers.rb diff --git a/examples/eval/classifiers.rb b/examples/eval/classifiers.rb new file mode 100644 index 00000000..2d93d320 --- /dev/null +++ b/examples/eval/classifiers.rb @@ -0,0 +1,144 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require "bundler/setup" +require "braintrust" +require "opentelemetry/sdk" + +# Example: Classifiers +# +# Classifiers categorize and label eval outputs. Unlike scorers (which return +# numeric 0-1 values), classifiers return structured Classification items — +# each with an :id, an optional :label, and optional :metadata. +# +# Results are stored as a dictionary keyed by classifier name: +# +# { "sentiment" => [{ id: "positive", label: "Positive" }] } +# +# Three patterns are shown: +# +# 1. Block-based (Braintrust::Classifier.new): +# Returns a single Classification hash. Good for concise, one-off classifiers. +# +# 2. Multi-label block-based: +# Returns an Array of Classification hashes — useful when a single +# classifier assigns multiple labels to the same output. +# +# 3. Class-based (include Braintrust::Classifier): +# Define a class with a #call method. Good for reusable classifiers +# that carry their own logic and state. +# +# Classifiers and scorers run independently. You can use both together, or +# use only classifiers when you don't need numeric scores. +# +# Usage: +# bundle exec ruby examples/eval/classifiers.rb + +Braintrust.init + +# --------------------------------------------------------------------------- +# Test cases: customer support messages +# --------------------------------------------------------------------------- +MESSAGES = [ + {input: "Hi! I just wanted to say thank you, the product is amazing!"}, + {input: "I've been waiting 2 weeks for my order. This is unacceptable!"}, + {input: "How do I reset my password? I can't find the option anywhere."}, + {input: "The item arrived damaged. 
I need a refund immediately."}, + {input: "Just checking in — any update on my ticket #4821?"} +] + +# --------------------------------------------------------------------------- +# Simulated task: generate a support response (replace with a real LLM call) +# --------------------------------------------------------------------------- +def generate_response(message) + case message + when /thank/i then "You're welcome! So glad you're enjoying it." + when /waiting|order/i then "I sincerely apologise for the delay. Let me look into this right away." + when /password|reset/i then "To reset your password, go to Settings > Account > Reset Password." + when /damaged|refund/i then "I'm sorry to hear that. I'll process your refund immediately." + else "Thanks for reaching out! Let me check on that for you." + end +end + +# --------------------------------------------------------------------------- +# Pattern 1: block-based single-label classifier +# +# Classifies each message into a single intent category. +# Declare only the kwargs you need — extras are filtered automatically. +# --------------------------------------------------------------------------- +intent_classifier = Braintrust::Classifier.new("intent") do |input:| + id = case input + when /thank/i then "praise" + when /waiting|order|update/i then "follow_up" + when /password|reset|find/i then "how_to" + when /damaged|refund/i then "complaint" + else "other" + end + + {name: "intent", id: id, label: id.tr("_", " ").capitalize} +end + +# --------------------------------------------------------------------------- +# Pattern 2: block-based multi-label classifier +# +# A single classifier can return an Array to assign multiple labels. +# All items sharing the same :name are grouped into the same results array. +# --------------------------------------------------------------------------- +tone_classifier = Braintrust::Classifier.new("tone") do |input:| + labels = [] + labels << {name: "tone", id: "urgent", label: "Urgent"} if input.match?(/immediately|unacceptable|waiting/i) + labels << {name: "tone", id: "polite", label: "Polite"} if input.match?(/please|thank|just checking/i) + labels << {name: "tone", id: "frustrated", label: "Frustrated"} if input.match?(/unacceptable|damaged|waiting/i) + labels << {name: "tone", id: "neutral", label: "Neutral"} if labels.empty? + labels +end + +# --------------------------------------------------------------------------- +# Pattern 3: class-based classifier +# +# Include Braintrust::Classifier and define #call with keyword args. +# The class name is snake_cased to derive the default classifier name +# (ResponseQualityClassifier -> "response_quality_classifier"). +# Override #name to customise it. +# --------------------------------------------------------------------------- +class ResponseQualityClassifier + include Braintrust::Classifier + + def name + "response_quality" + end + + def call(input:, output:) + word_count = output.to_s.split.length + + id = if output.to_s.strip.empty? 
+ "no_response" + elsif word_count < 5 + "too_short" + elsif output.match?(/immediately|right away|look into/i) + "action_oriented" + else + "informational" + end + + { + name: "response_quality", + id: id, + label: id.tr("_", " ").capitalize, + metadata: {word_count: word_count} + } + end +end + +# --------------------------------------------------------------------------- +# Run the eval — classifiers only (no numeric scores needed here) +# --------------------------------------------------------------------------- +Braintrust::Eval.run( + project: "ruby-sdk-examples", + experiment: "classifiers-example", + cases: MESSAGES, + task: ->(input:) { generate_response(input) }, + classifiers: [intent_classifier, tone_classifier, ResponseQualityClassifier.new] +) + +OpenTelemetry.tracer_provider.shutdown From d80d83c206317e47c4f05c0bf59eac6b93acdd7d Mon Sep 17 00:00:00 2001 From: Stephen Belanger Date: Wed, 22 Apr 2026 15:12:06 -0700 Subject: [PATCH 3/5] Fix server test evaluators to satisfy scorers/classifiers validation Co-Authored-By: Claude Sonnet 4.6 --- test/braintrust/contrib/rails/server/eval_controller_test.rb | 1 + test/braintrust/server/handlers/eval_test.rb | 1 + test/braintrust/server/rack/eval_endpoint_test.rb | 1 + test/braintrust/server/services/eval_service_test.rb | 1 + 4 files changed, 4 insertions(+) diff --git a/test/braintrust/contrib/rails/server/eval_controller_test.rb b/test/braintrust/contrib/rails/server/eval_controller_test.rb index 8eaaa54e..c7f2245c 100644 --- a/test/braintrust/contrib/rails/server/eval_controller_test.rb +++ b/test/braintrust/contrib/rails/server/eval_controller_test.rb @@ -159,6 +159,7 @@ def test_returns_401_when_auth_fails private def test_evaluator(**kwargs) + kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }] Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs) end diff --git a/test/braintrust/server/handlers/eval_test.rb b/test/braintrust/server/handlers/eval_test.rb index dce8a868..b93360dd 100644 --- a/test/braintrust/server/handlers/eval_test.rb +++ b/test/braintrust/server/handlers/eval_test.rb @@ -491,6 +491,7 @@ def test_handler_passes_parent_through private def test_evaluator(**kwargs) + kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }] Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs) end diff --git a/test/braintrust/server/rack/eval_endpoint_test.rb b/test/braintrust/server/rack/eval_endpoint_test.rb index a443e066..22d90654 100644 --- a/test/braintrust/server/rack/eval_endpoint_test.rb +++ b/test/braintrust/server/rack/eval_endpoint_test.rb @@ -197,6 +197,7 @@ def test_rejects_get private def test_evaluator(**kwargs) + kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }] Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs) end diff --git a/test/braintrust/server/services/eval_service_test.rb b/test/braintrust/server/services/eval_service_test.rb index 5320d5f7..b327103d 100644 --- a/test/braintrust/server/services/eval_service_test.rb +++ b/test/braintrust/server/services/eval_service_test.rb @@ -327,6 +327,7 @@ def test_build_state_evicts_oldest_when_cache_full private def test_evaluator(**kwargs) + kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }] Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs) end From dcca411d75ff912471651bba114d929278ceb884 Mon Sep 17 00:00:00 2001 From: Stephen Belanger Date: Wed, 22 Apr 2026 
22:30:53 -0700
Subject: [PATCH 4/5] Fix server tests to explicitly declare scorers rather
 than defaulting via helper

Reverts the blanket `kwargs[:scorers] ||= [...]` added in d80d83c. Each test
that actually runs an eval now explicitly passes `scorers: [noop_scorer]`
where needed. Tests that only exercise validation paths (400/404) are
unaffected.

Co-Authored-By: Claude Sonnet 4.6
---
 .../rails/server/eval_controller_test.rb      | 11 ++++---
 test/braintrust/server/handlers/eval_test.rb  | 29 ++++++++++---------
 .../server/rack/eval_endpoint_test.rb         | 15 ++++++----
 .../server/services/eval_service_test.rb      |  9 ++++--
 4 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/test/braintrust/contrib/rails/server/eval_controller_test.rb b/test/braintrust/contrib/rails/server/eval_controller_test.rb
index c7f2245c..93bd8ad0 100644
--- a/test/braintrust/contrib/rails/server/eval_controller_test.rb
+++ b/test/braintrust/contrib/rails/server/eval_controller_test.rb
@@ -24,7 +24,7 @@ def app
   end

   def test_streams_sse_events_for_inline_data
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }, scorers: [noop_scorer])
     reset_engine!(evaluators: @evaluators, auth: :none)

     post_json "/eval", {
@@ -53,7 +53,7 @@ def test_streams_sse_events_for_inline_data
   end

   def test_progress_events_contain_output
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }, scorers: [noop_scorer])
     reset_engine!(evaluators: @evaluators, auth: :none)

     post_json "/eval", {
@@ -94,7 +94,7 @@ def test_summary_event_contains_scores
   end

   def test_error_still_emits_progress_and_done
-    @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "task exploded" })
+    @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "task exploded" }, scorers: [noop_scorer])
     reset_engine!(evaluators: @evaluators, auth: :none)

     post_json "/eval", {
@@ -159,10 +159,13 @@ def test_returns_401_when_auth_fails
   private

   def test_evaluator(**kwargs)
-    kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }]
     Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs)
   end

+  def noop_scorer
+    Braintrust::Scorer.new("noop") { 1.0 }
+  end
+
   def post_json(path, body)
     post path, JSON.generate(body), {"CONTENT_TYPE" => "application/json"}
   end
diff --git a/test/braintrust/server/handlers/eval_test.rb b/test/braintrust/server/handlers/eval_test.rb
index b93360dd..884d5199 100644
--- a/test/braintrust/server/handlers/eval_test.rb
+++ b/test/braintrust/server/handlers/eval_test.rb
@@ -59,7 +59,7 @@ def test_returns_400_for_multiple_data_sources
   # --- SSE streaming ---

   def test_returns_200_with_sse_content_type
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     status, headers, _ = handler.call(rack_json_env(
       {name: "test-eval", data: {data: [{input: "hello"}]}, experiment_name: "exp"},
@@ -73,7 +73,7 @@
   def test_streams_progress_event_per_case
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "upcase-eval", data: {data: [{input: "a"}, {input: "b"}, {input: "c"}]}, experiment_name: "exp"},
@@ -88,7 +88,7 @@
   end

   def test_progress_event_contains_protocol_fields
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "upcase-eval", data: {data: [{input: "hello"}]}, experiment_name: "exp"},
@@ -108,7 +108,7 @@
   end

   def test_progress_event_contains_task_output_as_json_string
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "upcase-eval", data: {data: [{input: "hello"}]}, experiment_name: "exp"},
@@ -165,7 +165,7 @@ def test_summary_event_contains_scores_and_experiment_name
   end

   def test_stream_ends_with_done
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "test-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"},
@@ -177,7 +177,7 @@
   end

   def test_task_error_still_emits_progress_and_done
-    @evaluators["failing-eval"] = test_evaluator(task: -> { raise "boom" })
+    @evaluators["failing-eval"] = test_evaluator(task: -> { raise "boom" }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "failing-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"},
@@ -191,7 +191,7 @@
   end

   def test_task_error_progress_contains_error_event
-    @evaluators["failing-eval"] = test_evaluator(task: -> { raise "task exploded" })
+    @evaluators["failing-eval"] = test_evaluator(task: -> { raise "task exploded" }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "failing-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"},
@@ -239,7 +239,7 @@ def test_accepts_dataset_id_as_sole_data_source
   # --- Auth passthrough ---

   def test_build_state_returns_nil_without_auth
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     env = rack_json_env(
       {name: "test-eval", data: {data: [{input: "hello"}]}},
@@ -252,7 +252,7 @@
   end

   def test_build_state_returns_nil_for_non_hash_auth
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     env = rack_json_env(
       {name: "test-eval", data: {data: [{input: "hello"}]}},
@@ -442,7 +442,7 @@ def test_handler_resolves_scores_to_scorer_ids
   # --- Server-specific body selection ---

   def test_returns_sse_body_without_protocol_http_request
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "test-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"},
@@ -453,7 +453,7 @@
   end

   def test_returns_sse_stream_body_with_protocol_http_request
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     env = rack_json_env(
       {name: "test-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"},
@@ -470,7 +470,7 @@ def test_returns_sse_stream_body_with_protocol_http_request
   # --- Parent passthrough ---

   def test_handler_passes_parent_through
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {
@@ -491,10 +491,13 @@ def test_handler_passes_parent_through
   private

   def test_evaluator(**kwargs)
-    kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }]
     Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs)
   end

+  def noop_scorer
+    Braintrust::Scorer.new("noop") { 1.0 }
+  end
+
   def handler
     Braintrust::Server::Handlers::Eval.new(@evaluators)
   end
diff --git a/test/braintrust/server/rack/eval_endpoint_test.rb b/test/braintrust/server/rack/eval_endpoint_test.rb
index 22d90654..8e943ffe 100644
--- a/test/braintrust/server/rack/eval_endpoint_test.rb
+++ b/test/braintrust/server/rack/eval_endpoint_test.rb
@@ -21,7 +21,7 @@ def app
   end

   def test_streams_sse_events_for_inline_data
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer])

     post_json "/eval", {
       name: "upcase-eval",
@@ -52,7 +52,7 @@ def test_streams_sse_events_for_inline_data
   end

   def test_progress_events_contain_output
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer])

     post_json "/eval", {
       name: "upcase-eval",
@@ -94,7 +94,8 @@ def test_summary_event_contains_scores
   def test_error_still_emits_progress_and_done
     @evaluators["failing-eval"] = test_evaluator(
-      task: -> { raise "task exploded" }
+      task: -> { raise "task exploded" },
+      scorers: [noop_scorer]
     )

     post_json "/eval", {
@@ -167,7 +168,8 @@ def test_parameters_forwarded_to_task
       task: ->(input:, parameters:) {
         prefix = parameters["greeting"] || "hey"
         "#{prefix} #{input}"
-      }
+      },
+      scorers: [noop_scorer]
     )

     post_json "/eval", {
@@ -197,10 +199,13 @@ def test_rejects_get
   private

   def test_evaluator(**kwargs)
-    kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }]
     Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs)
   end

+  def noop_scorer
+    Braintrust::Scorer.new("noop") { 1.0 }
+  end
+
   def post_json(path, body)
     post path, JSON.generate(body), {"CONTENT_TYPE" => "application/json"}
   end
diff --git a/test/braintrust/server/services/eval_service_test.rb b/test/braintrust/server/services/eval_service_test.rb
index b327103d..4f5a32d1 100644
--- a/test/braintrust/server/services/eval_service_test.rb
+++ b/test/braintrust/server/services/eval_service_test.rb
@@ -92,7 +92,7 @@ def test_validate_accepts_dataset_name
   # --- stream ---

   def test_stream_emits_progress_and_done_events
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }, scorers: [noop_scorer])
     s = service
     validated = s.validate({
       "name" => "upcase-eval",
@@ -129,7 +129,7 @@ def test_stream_emits_summary_with_scores
   end

   def test_stream_emits_error_progress_on_task_failure
-    @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "boom" })
+    @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "boom" }, scorers: [noop_scorer])
     s = service
     validated = s.validate({
       "name" => "failing-eval",
@@ -327,10 +327,13 @@ def test_build_state_evicts_oldest_when_cache_full
   private

   def test_evaluator(**kwargs)
-    kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }]
     Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs)
   end

+  def noop_scorer
+    Braintrust::Scorer.new("noop") { 1.0 }
+  end
+
   def collect_streamed_events(svc, validated, auth: nil)
     chunks = []
     sse = Braintrust::Server::SSEWriter.new { |chunk| chunks << chunk }

From 541e6e6cb13421267961d7ef867b383e3d57db46 Mon Sep 17 00:00:00 2001
From: Stephen Belanger
Date: Tue, 5 May 2026 15:26:56 -0700
Subject: [PATCH 5/5] Surface classifier errors via result.errors

Mirror the scorer error pattern by pushing classifier failures onto the
errors queue, so they appear in result.errors (and result.success? is
false) in addition to being recorded in span metadata.

Addresses PR #154 review feedback.
---
 lib/braintrust/eval/runner.rb       |  3 +++
 test/braintrust/eval/runner_test.rb | 10 +++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/lib/braintrust/eval/runner.rb b/lib/braintrust/eval/runner.rb
index 0cd249d8..074415ff 100644
--- a/lib/braintrust/eval/runner.rb
+++ b/lib/braintrust/eval/runner.rb
@@ -128,6 +128,9 @@ def run_eval_case(kase, errors)
       existing_metadata = kase.metadata || {}
       classifier_errors_metadata = existing_metadata.merge(classifier_errors: classifier_errors)
       set_json_attr(eval_span, "braintrust.metadata", classifier_errors_metadata)
+      classifier_errors.each do |classifier_name, message|
+        errors << "Classifier '#{classifier_name}' failed for input '#{kase.input}': #{message}"
+      end
     end

     # Set output after task completes
diff --git a/test/braintrust/eval/runner_test.rb b/test/braintrust/eval/runner_test.rb
index 28eb95a4..7b021236 100644
--- a/test/braintrust/eval/runner_test.rb
+++ b/test/braintrust/eval/runner_test.rb
@@ -2145,8 +2145,10 @@ def test_runner_classifier_error_does_not_abort_eval
     )

     result = Braintrust::Eval::Runner.new(context).run
-    # Eval succeeds — classifier errors don't add to errors queue
-    assert result.success?
+    # Eval continues running task and scorers, but classifier errors are surfaced.
+    refute result.success?
+    assert_equal 1, result.errors.length
+    assert_match(/Classifier 'broken' failed for input 'hello': classifier boom/, result.errors.first)
     assert_equal [1.0], result.scores["always_one"]
     assert_nil result.classifications
   end
@@ -2168,7 +2170,9 @@ def test_runner_classifier_error_does_not_affect_other_classifiers
     )

     result = Braintrust::Eval::Runner.new(context).run
-    assert result.success?
+    refute result.success?
+    assert_equal 1, result.errors.length
+    assert_match(/Classifier 'broken' failed for input 'hello': boom/, result.errors.first)
     assert_equal({"working" => [{id: "ok"}]}, result.classifications)
   end
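
A closing note on the test-side pattern from PATCH 4/5: with the helper
default removed, every test that actually executes an eval wires its scorer
explicitly, as shown below. This is a sketch assembled from the hunks above;
`test_evaluator` and `noop_scorer` are the private helpers each test class
defines.

    # Explicit wiring: test_evaluator no longer injects a default scorer.
    @evaluators["upcase-eval"] = test_evaluator(
      task: ->(input:) { input.to_s.upcase },
      scorers: [noop_scorer]  # previously filled in by kwargs[:scorers] ||= [...]
    )

Validation-only tests (the 400/404 paths) never reach the runner, so they can
keep constructing evaluators without scorers.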
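
Taken together, the contract PATCH 5/5 lands: a classifier failure is
recorded in span metadata, pushed onto the errors queue, and does not abort
the task or scorers. A minimal sketch of the resulting observable behavior,
adapted from the runner tests above. `context` is an assumption here: it
stands in for the eval context that runner_test.rb builds with one raising
classifier and one passing scorer, and its construction is elided.

    # Assumed setup, mirroring test_runner_classifier_error_does_not_abort_eval;
    # registration of these into `context` is elided (see runner_test.rb).
    broken = Braintrust::Classifier.new("broken") { |output:| raise "classifier boom" }
    always_one = Braintrust::Scorer.new("always_one") { 1.0 }

    result = Braintrust::Eval::Runner.new(context).run

    result.success?         # => false: the classifier failure lands in result.errors
    result.errors.first     # => "Classifier 'broken' failed for input 'hello': classifier boom"
    result.scores           # => {"always_one" => [1.0]}: task and scorers still ran
    result.classifications  # => nil: the broken classifier produced nothing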