From e058e4ddc48840065ff2199d319bc6246acfbf24 Mon Sep 17 00:00:00 2001 From: Stephen Belanger Date: Wed, 22 Apr 2026 10:43:34 -0700 Subject: [PATCH 1/5] Add classifier support --- lib/braintrust/classifier.rb | 157 ++++++++++ lib/braintrust/eval.rb | 35 ++- lib/braintrust/eval/context.rb | 47 ++- lib/braintrust/eval/evaluator.rb | 21 +- lib/braintrust/eval/result.rb | 6 +- lib/braintrust/eval/runner.rb | 113 +++++++- .../server/services/list_service.rb | 5 + test/braintrust/classifier_test.rb | 226 +++++++++++++++ test/braintrust/eval/evaluator_test.rb | 5 +- test/braintrust/eval/runner_test.rb | 269 ++++++++++++++++++ test/braintrust/eval_test.rb | 48 ++++ test/support/braintrust_helper.rb | 6 +- 12 files changed, 901 insertions(+), 37 deletions(-) create mode 100644 lib/braintrust/classifier.rb create mode 100644 test/braintrust/classifier_test.rb diff --git a/lib/braintrust/classifier.rb b/lib/braintrust/classifier.rb new file mode 100644 index 00000000..089de302 --- /dev/null +++ b/lib/braintrust/classifier.rb @@ -0,0 +1,157 @@ +# frozen_string_literal: true + +require_relative "internal/callable" + +module Braintrust + # Classifier wraps a classification function that categorizes and labels eval outputs. + # + # Unlike scorers (which return numeric 0-1 values), classifiers return structured + # {Classification} items with an id and optional label and metadata. + # + # Use inline with a block (keyword args): + # classifier = Classifier.new("category") { |output:| {name: "category", id: "greeting", label: "Greeting"} } + # + # Or include in a class and define #call with keyword args: + # class CategoryClassifier + # include Braintrust::Classifier + # + # def call(output:) + # {name: "category", id: "greeting", label: "Greeting"} + # end + # end + # + # Classifiers may return a single Classification hash, an Array of them, or nil + # (meaning no classifications for this case). + module Classifier + DEFAULT_NAME = "classifier" + + # @param base [Class] the class including Classifier + def self.included(base) + base.include(Callable) + end + + # Create a block-based classifier. + # + # @param name [String, nil] optional name (defaults to "classifier") + # @param block [Proc] the classification implementation; declare only the keyword + # args you need. Extra kwargs are filtered out automatically. + # + # Supported kwargs: +input:+, +expected:+, +output:+, +metadata:+, +trace:+, +parameters:+ + # @return [Classifier::Block] + # @raise [ArgumentError] if the block has unsupported arity + def self.new(name = nil, &block) + Block.new(name: name || DEFAULT_NAME, &block) + end + + # Included into classes that +include Classifier+. Prepends KeywordFilter and + # ClassificationNormalizer so #call receives only declared kwargs and always returns + # Array. Also provides a default #name and #call_parameters. + module Callable + # Normalizes the raw return value of #call into Array. + # Nested inside Callable because it depends on #name which Callable provides. 
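+      # For example (values hypothetical), a classifier named "category" whose
+      # block returns {id: "greeting"} is normalized to
+      # [{name: "category", id: "greeting"}].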
+ module ClassificationNormalizer + # @return [Array] normalized classification hashes with :name, :id, and optional :label, :metadata keys + def call(**kwargs) + normalize_classification_result(super) + end + + private + + # @param result [Hash, Array, nil] raw return value from #call + # @return [Array] zero or more classification hashes with :name, :id keys + # @raise [ArgumentError] if any item is not a non-empty object + def normalize_classification_result(result) + case result + when nil then [] + when Array then result.map { |item| normalize_classification_item(item) } + when Hash then [normalize_classification_item(result)] + else + raise ArgumentError, "When returning structured classifier results, each classification must be a non-empty object. Got: #{result.inspect}" + end + end + + # Fills in missing :name from the classifier, validates :id. + # @param item [Hash] a classification hash + # @return [Hash] the item with :name defaulted and validated + # @raise [ArgumentError] if item is not a non-empty Hash + def normalize_classification_item(item) + unless item.is_a?(Hash) && !item.empty? + raise ArgumentError, "When returning structured classifier results, each classification must be a non-empty object. Got: #{item.inspect}" + end + + # :name defaults to the classifier's resolved name when missing, empty, or non-string + unless item[:name].is_a?(String) && !item[:name].empty? + item = item.merge(name: name) + end + + item + end + end + + # Infrastructure modules prepended onto every classifier class. + # Used both to set up the ancestor chain and to skip past them in + # #call_parameters so KeywordFilter sees the real call signature. + PREPENDED = [Internal::Callable::KeywordFilter, ClassificationNormalizer].freeze + + # @param base [Class] the class including Callable + def self.included(base) + PREPENDED.each { |mod| base.prepend(mod) } + end + + # Default name derived from the class name (e.g. CategoryClassifier -> "category_classifier"). + # @return [String] + def name + klass = self.class.name&.split("::")&.last + return Classifier::DEFAULT_NAME unless klass + klass.gsub(/([a-z])([A-Z])/, '\1_\2').downcase + end + + # Provides KeywordFilter with the actual call signature of the subclass. + # Walks past PREPENDED modules in the ancestor chain so that user-defined + # #call keyword params are correctly introspected. + # Block overrides this to point directly at @block.parameters. + # @return [Array] parameter list + def call_parameters + meth = method(:call) + meth = meth.super_method while meth.super_method && PREPENDED.include?(meth.owner) + meth.parameters + end + end + + # Block-based classifier. Stores a Proc and delegates #call to it. + # Includes Classifier so it satisfies +Classifier ===+ checks. + # Exposes #call_parameters so KeywordFilter can introspect the block's + # declared kwargs rather than Block#call's **kwargs signature. 
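+    # For example (a hedged sketch): a block declaring only |output:| can still
+    # be invoked as classifier.call(input: "x", output: "y"); the block receives
+    # only output: "y", and input: is filtered away.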
+ class Block + include Classifier + + # @return [String] + attr_reader :name + + # @param name [String] classifier name + # @param block [Proc] classification implementation; must use keyword args or zero-arity + # @raise [ArgumentError] if the block uses positional params + def initialize(name: DEFAULT_NAME, &block) + @name = name + params = block.parameters + unless Internal::Callable::KeywordFilter.has_any_keywords?(params) || block.arity == 0 + raise ArgumentError, "Classifier block must use keyword args (got arity #{block.arity})" + end + @block = block + end + + # @param kwargs [Hash] keyword arguments (filtered by KeywordFilter) + # @return [Array] normalized classification results + def call(**kwargs) + @block.call(**kwargs) + end + + # Exposes the block's parameter list so KeywordFilter can filter + # kwargs to match the block's declared keywords. + # @return [Array] parameter list from Proc#parameters + def call_parameters + @block.parameters + end + end + end +end diff --git a/lib/braintrust/eval.rb b/lib/braintrust/eval.rb index d661eedc..ac27b48b 100644 --- a/lib/braintrust/eval.rb +++ b/lib/braintrust/eval.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true +require_relative "classifier" require_relative "scorer" require_relative "task" require_relative "functions" @@ -160,7 +161,10 @@ def scorer(name, callable = nil, &block) # - String: dataset name (fetches from same project) # - Hash: {name:, id:, project:, version:, limit:} # @param task [#call] The task to evaluate (must be callable) - # @param scorers [Array] The scorers to use (String names, Scorer objects, or callables) + # @param scorers [Array, nil] The scorers to use (String names, Scorer objects, or callables). + # At least one of scorers or classifiers must be provided. + # @param classifiers [Array, nil] The classifiers to use. + # At least one of scorers or classifiers must be provided. # @param on_progress [#call, nil] Optional callback fired after each test case. # Receives a Hash: {"data" => output, "scores" => {name => value}} on success, # or {"error" => message} on failure. 
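A minimal end-to-end sketch of the extended signature (the project name is
hypothetical, and Braintrust.init/API configuration is assumed to happen
elsewhere):

    result = Braintrust::Eval.run(
      project: "my-project",
      cases: [{input: "hi", expected: "HI"}],
      task: ->(input:) { input.upcase },
      scorers: [Braintrust::Scorer.new("exact") { |expected:, output:| (output == expected) ? 1.0 : 0.0 }],
      classifiers: [Braintrust::Classifier.new("length") { |output:| {id: (output.length > 2) ? "long" : "short"} }]
    )
    result.scores          # => {"exact" => [1.0]}
    result.classifications # => {"length" => [{id: "short"}]}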
@@ -177,13 +181,16 @@ def scorer(name, callable = nil, &block) # @param parent [Hash, nil] Parent span context ({object_type:, object_id:, generation:}) # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument # @return [Result] - def run(task:, scorers:, project: nil, experiment: nil, - cases: nil, dataset: nil, on_progress: nil, + def run(task:, scorers: nil, classifiers: nil, project: nil, + experiment: nil, cases: nil, dataset: nil, on_progress: nil, parallelism: 1, tags: nil, metadata: nil, update: false, quiet: false, state: nil, tracer_provider: nil, project_id: nil, parent: nil, parameters: nil) # Validate required parameters - validate_params!(task: task, scorers: scorers, cases: cases, dataset: dataset) + validate_params!(task: task, scorers: scorers, + classifiers: classifiers, cases: cases, dataset: dataset) + scorers ||= [] + classifiers ||= [] experiment_id = nil project_name = project @@ -216,6 +223,7 @@ def run(task:, scorers:, project: nil, experiment: nil, context = Context.build( task: task, scorers: scorers, + classifiers: classifiers, cases: cases, experiment_id: experiment_id, experiment_name: experiment, @@ -245,9 +253,19 @@ def print_result(result) # Validate required parameters # @raise [ArgumentError] if validation fails - def validate_params!(task:, scorers:, cases:, dataset:) + def validate_params!(task:, scorers:, classifiers:, cases:, dataset:) raise ArgumentError, "task is required" unless task - raise ArgumentError, "scorers is required" unless scorers + + # Validate task is callable before anything else + unless task.respond_to?(:call) + raise ArgumentError, "task must be callable (respond to :call)" + end + + has_scorers = scorers && !scorers.empty? + has_classifiers = classifiers && !classifiers.empty? + unless has_scorers || has_classifiers + raise ArgumentError, "at least one of scorers or classifiers is required" + end # Validate cases and dataset are mutually exclusive if cases && dataset @@ -258,11 +276,6 @@ def validate_params!(task:, scorers:, cases:, dataset:) unless cases || dataset raise ArgumentError, "must specify either 'cases' or 'dataset'" end - - # Validate task is callable - unless task.respond_to?(:call) - raise ArgumentError, "task must be callable (respond to :call)" - end end # Resolve project by name or ID. Creates if needed. diff --git a/lib/braintrust/eval/context.rb b/lib/braintrust/eval/context.rb index ebcca050..02717267 100644 --- a/lib/braintrust/eval/context.rb +++ b/lib/braintrust/eval/context.rb @@ -1,18 +1,20 @@ # frozen_string_literal: true require_relative "cases" +require_relative "../classifier" module Braintrust module Eval # Holds all normalized, ready-to-execute eval components. # Use Context.build to construct from raw user inputs. 
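+    # For example (task/classifier values hypothetical):
+    #   Context.build(task: ->(input:) { input }, scorers: [], classifiers: [classifier], cases: [{input: "hi"}])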
class Context - attr_reader :task, :scorers, :cases, :experiment_id, :experiment_name, - :project_id, :project_name, :state, :tracer_provider, + attr_reader :task, :scorers, :classifiers, :cases, :experiment_id, + :experiment_name, :project_id, :project_name, :state, :tracer_provider, :on_progress, :parent_span_attr, :generation, :parameters # @param task [Task] Normalized task wrapper # @param scorers [Array] Normalized scorer wrappers + # @param classifiers [Array] Normalized classifier wrappers # @param cases [Cases] Normalized eval cases # @param experiment_id [String, nil] Experiment ID for logging and trace linkage # @param experiment_name [String, nil] Experiment name, included in span attributes @@ -24,11 +26,13 @@ class Context # @param parent_span_attr [String, nil] Formatted parent span identifier ("type:id"), linking spans to a parent context # @param generation [Integer, nil] Generation number from the parent span context, used to link spans in a trace hierarchy # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument - def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, - project_id: nil, project_name: nil, state: nil, tracer_provider: nil, - on_progress: nil, parent_span_attr: nil, generation: nil, parameters: nil) + def initialize(task:, scorers:, cases:, classifiers: [], + experiment_id: nil, experiment_name: nil, project_id: nil, + project_name: nil, state: nil, tracer_provider: nil, on_progress: nil, + parent_span_attr: nil, generation: nil, parameters: nil) @task = task @scorers = scorers + @classifiers = classifiers @cases = cases @experiment_id = experiment_id @experiment_name = experiment_name @@ -46,6 +50,7 @@ def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil # Delegates to Factory for normalization. 
# @param task [Task, Proc, #call] Task to evaluate; wrapped into a {Task} if needed # @param scorers [Array] Scorers; each is normalized into a {Scorer} + # @param classifiers [Array] Classifiers; each is normalized into a {Classifier} # @param cases [Cases, Array, Enumerable] Eval cases; wrapped into {Cases} if needed # @param experiment_id [String, nil] Experiment ID for logging # @param experiment_name [String, nil] Experiment name, included in span attributes @@ -57,14 +62,15 @@ def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil # @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument # @return [Context] - def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, - project_id: nil, project_name: nil, state: nil, tracer_provider: nil, - on_progress: nil, parent: nil, parameters: nil) + def self.build(task:, scorers:, cases:, classifiers: [], + experiment_id: nil, experiment_name: nil, project_id: nil, + project_name: nil, state: nil, tracer_provider: nil, on_progress: nil, + parent: nil, parameters: nil) Factory.new( state: state, tracer_provider: tracer_provider, project_id: project_id, project_name: project_name ).build( - task: task, scorers: scorers, cases: cases, + task: task, scorers: scorers, classifiers: classifiers, cases: cases, experiment_id: experiment_id, experiment_name: experiment_name, on_progress: on_progress, parent: parent, parameters: parameters ) @@ -86,17 +92,19 @@ def initialize(state: nil, tracer_provider: nil, project_id: nil, project_name: # Normalize raw inputs and construct a {Context}. # @param task [Task, Proc, #call] Raw task # @param scorers [Array] Raw scorers + # @param classifiers [Array] Raw classifiers # @param cases [Cases, Array, Enumerable] Raw eval cases # @param experiment_id [String, nil] # @param experiment_name [String, nil] # @param on_progress [Proc, nil] # @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation # @return [Context] - def build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, - on_progress: nil, parent: nil, parameters: nil) + def build(task:, scorers:, cases:, classifiers: [], experiment_id: nil, + experiment_name: nil, on_progress: nil, parent: nil, parameters: nil) Context.new( task: normalize_task(task), scorers: normalize_scorers(scorers), + classifiers: normalize_classifiers(classifiers), cases: normalize_cases(cases), experiment_id: experiment_id, experiment_name: experiment_name, @@ -188,6 +196,23 @@ def normalize_scorers(raw) end end end + + # @param raw [Array] + # @return [Array] + def normalize_classifiers(raw) + raw.map do |classifier| + case classifier + when Braintrust::Classifier + classifier + when Proc + # Pass Proc/Lambda directly to preserve keyword arg info + Braintrust::Classifier.new(&classifier) + else + name = classifier.respond_to?(:name) ? 
classifier.name : nil + Braintrust::Classifier.new(name, &classifier.method(:call)) + end + end + end end end end diff --git a/lib/braintrust/eval/evaluator.rb b/lib/braintrust/eval/evaluator.rb index a5f135bc..6798f31a 100644 --- a/lib/braintrust/eval/evaluator.rb +++ b/lib/braintrust/eval/evaluator.rb @@ -40,11 +40,12 @@ module Eval # } # ) class Evaluator - attr_accessor :task, :scorers, :parameters + attr_accessor :task, :scorers, :classifiers, :parameters - def initialize(task: nil, scorers: [], parameters: {}) + def initialize(task: nil, scorers: [], classifiers: [], parameters: {}) @task = task @scorers = scorers + @classifiers = classifiers @parameters = parameters end @@ -68,6 +69,7 @@ def validate! # @param project_id [String, nil] Project UUID (skips project creation) # @param dataset [String, Hash, Dataset, Dataset::ID, nil] Dataset to fetch # @param scorers [Array, nil] Additional scorers (merged with evaluator's own) + # @param classifiers [Array, nil] Additional classifiers (merged with evaluator's own) # @param parent [Hash, nil] Parent span context # @param state [State, nil] Braintrust state # @param update [Boolean] If true, allow reusing existing experiment (default: false) @@ -75,16 +77,19 @@ def validate! # @return [Result] def run(cases, on_progress: nil, quiet: false, project: nil, experiment: nil, project_id: nil, - dataset: nil, scorers: nil, parent: nil, + dataset: nil, scorers: nil, classifiers: nil, parent: nil, state: nil, update: false, tracer_provider: nil, parameters: nil) all_scorers = scorers ? self.scorers + scorers : self.scorers + all_classifiers = classifiers ? + self.classifiers + classifiers : + self.classifiers Braintrust::Eval.run( - task: task, scorers: all_scorers, cases: cases, dataset: dataset, - project: project, experiment: experiment, project_id: project_id, - parent: parent, on_progress: on_progress, quiet: quiet, - state: state, update: update, tracer_provider: tracer_provider, - parameters: parameters + task: task, scorers: all_scorers, classifiers: all_classifiers, + cases: cases, dataset: dataset, project: project, + experiment: experiment, project_id: project_id, parent: parent, + on_progress: on_progress, quiet: quiet, state: state, update: update, + tracer_provider: tracer_provider, parameters: parameters ) end end diff --git a/lib/braintrust/eval/result.rb b/lib/braintrust/eval/result.rb index c18af302..7af7132f 100644 --- a/lib/braintrust/eval/result.rb +++ b/lib/braintrust/eval/result.rb @@ -9,7 +9,7 @@ module Eval # Contains experiment metadata, errors, timing information, and raw score data class Result attr_reader :experiment_id, :experiment_name, :project_id, :project_name, - :permalink, :errors, :duration, :scores + :permalink, :errors, :duration, :scores, :classifications # Create a new result # @param experiment_id [String] The experiment ID @@ -20,8 +20,9 @@ class Result # @param errors [Array] List of errors that occurred # @param duration [Float] Duration in seconds # @param scores [Hash, nil] Raw score data { scorer_name => Array } + # @param classifications [Hash, nil] Classification results { name => Array }, nil when no classifiers ran def initialize(experiment_id:, experiment_name:, project_id:, project_name:, - permalink:, errors:, duration:, scores: nil) + permalink:, errors:, duration:, scores: nil, classifications: nil) @experiment_id = experiment_id @experiment_name = experiment_name @project_id = project_id @@ -30,6 +31,7 @@ def initialize(experiment_id:, experiment_name:, project_id:, project_name:, @errors 
= errors @duration = duration @scores = scores + @classifications = classifications end # Check if the evaluation was successful (no errors) diff --git a/lib/braintrust/eval/runner.rb b/lib/braintrust/eval/runner.rb index f461e041..0cd249d8 100644 --- a/lib/braintrust/eval/runner.rb +++ b/lib/braintrust/eval/runner.rb @@ -27,8 +27,9 @@ def initialize(eval_context) @eval_context = eval_context @tracer = eval_context.tracer_provider.tracer("braintrust-eval") - # Mutex for thread-safe score collection + # Mutexes for thread-safe result collection @score_mutex = Mutex.new + @classification_mutex = Mutex.new end # Run evaluation and return Result @@ -39,6 +40,7 @@ def run(parallelism: 1) eval_cases = eval_context.cases errors = Queue.new @scores = {} # Reset for each run: { scorer_name => Array } + @classifications = {} # Reset for each run: { classifier_name => Array } if parallelism && parallelism > 1 Internal::ThreadPool.each(eval_cases, parallelism: parallelism) do |eval_case| @@ -69,7 +71,8 @@ def run(parallelism: 1) permalink: permalink, errors: error_array, duration: duration, - scores: @scores + scores: @scores, + classifications: @classifications.empty? ? nil : @classifications ) end @@ -119,6 +122,14 @@ def run_eval_case(kase, errors) errors << "Scorers failed for input '#{kase.input}': #{e.message}" end + # Run classifiers (independent of scorers; errors do not abort eval) + classifier_errors = run_classifiers(kase, eval_span) + unless classifier_errors.empty? + existing_metadata = kase.metadata || {} + classifier_errors_metadata = existing_metadata.merge(classifier_errors: classifier_errors) + set_json_attr(eval_span, "braintrust.metadata", classifier_errors_metadata) + end + # Set output after task completes set_json_attr(eval_span, "braintrust.output_json", {output: kase.output}) @@ -318,6 +329,104 @@ def collect_scores(score_results) score_results.each { |s| (@scores[s[:name]] ||= []) << s[:score] } end end + + # Run all classifiers for a case. Classifier errors are non-fatal and stored in metadata. + # @param kase [CaseContext] The per-case context (output must be populated) + # @param eval_span [OpenTelemetry::Trace::Span] The eval span for this case + # @return [Hash] classifier_errors map (name -> error message), empty if no errors + def run_classifiers(kase, eval_span) + return {} if eval_context.classifiers.empty? + + classifier_kwargs = { + input: kase.input, + expected: kase.expected, + output: kase.output, + metadata: kase.metadata || {}, + trace: kase.trace, + parameters: eval_context.parameters || {} + } + classifier_input = { + input: kase.input, + expected: kase.expected, + output: kase.output, + metadata: kase.metadata || {}, + parameters: eval_context.parameters || {} + } + + case_classifications = {} + classifier_errors = {} + + eval_context.classifiers.each_with_index do |classifier, index| + classifier_name = classifier.name || "classifier_#{index}" + begin + results = run_classifier(classifier, classifier_kwargs, classifier_input) + results.each do |item| + item_name = item[:name] + classification_item = item.except(:name) + (case_classifications[item_name] ||= []) << classification_item + end + collect_classifications(results) + rescue => e + Braintrust::Log.warn("[Classifier] #{classifier_name} failed: #{e.message}") + classifier_errors[classifier_name] = e.message + end + end + + unless case_classifications.empty? 
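+          # e.g. {"category" => [{id: "greeting", label: "Greeting"}]} (hypothetical values)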
+ set_json_attr(eval_span, "braintrust.classifications", case_classifications) + end + + classifier_errors + end + + # Run a single classifier inside its own span. + # @param classifier [Classifier] The classifier to run + # @param classifier_kwargs [Hash] Keyword arguments for the classifier + # @param classifier_input [Hash] Input to log on the span + # @return [Array] Normalized classification results from the classifier + def run_classifier(classifier, classifier_kwargs, classifier_input) + tracer.in_span(classifier.name) do |classifier_span| + classifier_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr + set_json_attr(classifier_span, "braintrust.span_attributes", build_classifier_span_attributes(classifier.name)) + set_json_attr(classifier_span, "braintrust.input_json", classifier_input) + + classification_results = classifier.call(**classifier_kwargs) + + # Build output dict keyed by name -> array of items (for span logging) + output_by_name = {} + classification_results.each do |item| + (output_by_name[item[:name]] ||= []) << item.except(:name) + end + + set_json_attr(classifier_span, "braintrust.output_json", output_by_name) + + classification_results + rescue => e + record_span_error(classifier_span, e, "ClassifierError") + raise + end + end + + # Build span_attributes for a classifier span. + # @param classifier_name [String] The classifier name + # @return [Hash] + def build_classifier_span_attributes(classifier_name) + attrs = {type: "classifier", name: classifier_name, purpose: "scorer"} + attrs[:generation] = eval_context.generation if eval_context.generation + attrs + end + + # Collect classification results into the global accumulator (thread-safe). + # Converts Classification to ClassificationItem by dropping :name. + # @param classification_results [Array] Classification results from a classifier + def collect_classifications(classification_results) + @classification_mutex.synchronize do + classification_results.each do |item| + item_name = item[:name] + (@classifications[item_name] ||= []) << item.except(:name) + end + end + end end end end diff --git a/lib/braintrust/server/services/list_service.rb b/lib/braintrust/server/services/list_service.rb index 06bd7add..8c29c6d3 100644 --- a/lib/braintrust/server/services/list_service.rb +++ b/lib/braintrust/server/services/list_service.rb @@ -20,6 +20,11 @@ def call {"name" => scorer_name} end entry = {"scores" => scores} + classifiers = (evaluator.classifiers || []).each_with_index.map do |classifier, i| + classifier_name = classifier.respond_to?(:name) ? classifier.name : "classifier_#{i}" + {"name" => classifier_name} + end + entry["classifiers"] = classifiers unless classifiers.empty? 
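+          # Illustrative entry shape with both kinds present (names hypothetical):
+          #   {"scores" => [{"name" => "exact"}], "classifiers" => [{"name" => "category"}]}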
params = serialize_parameters(evaluator.parameters) entry["parameters"] = params if params result[name] = entry diff --git a/test/braintrust/classifier_test.rb b/test/braintrust/classifier_test.rb new file mode 100644 index 00000000..8dfe4501 --- /dev/null +++ b/test/braintrust/classifier_test.rb @@ -0,0 +1,226 @@ +# frozen_string_literal: true + +require "test_helper" +require "braintrust/classifier" + +class Braintrust::ClassifierTest < Minitest::Test + # ============================================ + # Classifier.new with block (inline classifiers) + # ============================================ + + def test_classifier_with_kwargs_block + classifier = Braintrust::Classifier.new("category") do |output:, **| + {name: "category", id: "greeting", label: "Greeting"} + end + + assert_equal "category", classifier.name + result = classifier.call(input: "hello", expected: nil, output: "hello") + assert_equal [{name: "category", id: "greeting", label: "Greeting"}], result + end + + def test_classifier_with_subset_kwargs_filters_extra_keys + classifier = Braintrust::Classifier.new("category") do |output:| + {name: "category", id: "word"} + end + + result = classifier.call(input: "x", expected: nil, output: "hello", metadata: {}, tags: ["t"]) + assert_equal [{name: "category", id: "word"}], result + end + + def test_classifier_returns_nil_produces_empty_array + classifier = Braintrust::Classifier.new("maybe") { |**| nil } + assert_equal [], classifier.call(output: "hello") + end + + def test_classifier_returns_array_of_classifications + classifier = Braintrust::Classifier.new("sentiment") do |**| + [ + {name: "sentiment", id: "positive", label: "Positive"}, + {name: "sentiment", id: "enthusiastic", label: "Enthusiastic"} + ] + end + + result = classifier.call(output: "great!") + assert_equal 2, result.length + assert_equal({name: "sentiment", id: "positive", label: "Positive"}, result[0]) + assert_equal({name: "sentiment", id: "enthusiastic", label: "Enthusiastic"}, result[1]) + end + + def test_classifier_with_metadata + classifier = Braintrust::Classifier.new("category") do |**| + {name: "category", id: "greeting", label: "Greeting", metadata: {source: "unit-test"}} + end + + result = classifier.call(output: "hello") + assert_equal [{name: "category", id: "greeting", label: "Greeting", metadata: {source: "unit-test"}}], result + end + + # ============================================ + # Name defaulting + # ============================================ + + def test_name_defaults_to_classifier_function_name_when_missing + classifier = Braintrust::Classifier.new("my_classifier") { |**| + {id: "foo"} # no :name key + } + + result = classifier.call(output: "x") + assert_equal "my_classifier", result[0][:name] + end + + def test_name_defaults_to_classifier_function_name_when_empty_string + classifier = Braintrust::Classifier.new("my_classifier") { |**| + {name: "", id: "foo"} + } + + result = classifier.call(output: "x") + assert_equal "my_classifier", result[0][:name] + end + + def test_name_defaults_to_classifier_function_name_when_not_a_string + classifier = Braintrust::Classifier.new("my_classifier") { |**| + {name: 42, id: "foo"} + } + + result = classifier.call(output: "x") + assert_equal "my_classifier", result[0][:name] + end + + def test_explicit_name_in_result_takes_precedence + classifier = Braintrust::Classifier.new("my_classifier") { |**| + {name: "override_name", id: "foo"} + } + + result = classifier.call(output: "x") + assert_equal "override_name", result[0][:name] + end + + # 
============================================ + # Validation + # ============================================ + + def test_classifier_non_empty_object_validation_nil_item + classifier = Braintrust::Classifier.new("bad") { |**| [nil] } + + error = assert_raises(ArgumentError) do + classifier.call(output: "x") + end + assert_match(/each classification must be a non-empty object/, error.message) + assert_match(/nil/, error.message) + end + + def test_classifier_non_empty_object_validation_empty_hash + classifier = Braintrust::Classifier.new("bad") { |**| {} } + + error = assert_raises(ArgumentError) do + classifier.call(output: "x") + end + assert_match(/each classification must be a non-empty object/, error.message) + end + + def test_classifier_non_empty_object_validation_string_item + classifier = Braintrust::Classifier.new("bad") { |**| ["not-a-hash"] } + + error = assert_raises(ArgumentError) do + classifier.call(output: "x") + end + assert_match(/each classification must be a non-empty object/, error.message) + end + + def test_classifier_non_empty_object_validation_non_hash_scalar + classifier = Braintrust::Classifier.new("bad") { |**| 42 } + + error = assert_raises(ArgumentError) do + classifier.call(output: "x") + end + assert_match(/each classification must be a non-empty object/, error.message) + end + + def test_classifier_positional_params_raises + error = assert_raises(ArgumentError) do + Braintrust::Classifier.new("bad") { |a, b| a } + end + + assert_match(/classifier block must use keyword args/i, error.message) + end + + # ============================================ + # Name detection + # ============================================ + + def test_classifier_name_defaults_to_classifier_for_base_class + classifier = Braintrust::Classifier.new { |**| {id: "x"} } + assert_equal "classifier", classifier.name + end + + def test_classifier_explicit_name_takes_precedence + classifier = Braintrust::Classifier.new("my_name") { |**| {id: "x"} } + assert_equal "my_name", classifier.name + end + + # ============================================ + # Subclass pattern + # ============================================ + + def test_subclass_with_call_override + klass = Class.new do + include Braintrust::Classifier + + def call(output:) + {name: "category", id: output.empty? ? 
"empty" : "nonempty"} + end + end + + classifier = klass.new + assert_kind_of Braintrust::Classifier, classifier + + result = classifier.call(input: "x", expected: nil, output: "hello") + assert_equal [{name: "category", id: "nonempty"}], result + + result2 = classifier.call(input: "x", expected: nil, output: "") + assert_equal [{name: "category", id: "empty"}], result2 + end + + def test_subclass_with_name_override + klass = Class.new do + include Braintrust::Classifier + + def name + "custom_classifier" + end + + def call(**) + {id: "foo"} + end + end + + classifier = klass.new + assert_equal "custom_classifier", classifier.name + end + + def test_subclass_name_derived_from_class_name + klass = Class.new do + include Braintrust::Classifier + + def call(**) + {id: "foo"} + end + end + + Braintrust.stub_const(:FuzzyMatchTestClassifier, klass) do + classifier = klass.new + assert_equal "fuzzy_match_test_classifier", classifier.name + end + end + + def test_subclass_without_call_raises_on_call + klass = Class.new do + include Braintrust::Classifier + end + classifier = klass.new + + assert_raises(NoMethodError) do + classifier.call(output: "x") + end + end +end diff --git a/test/braintrust/eval/evaluator_test.rb b/test/braintrust/eval/evaluator_test.rb index e6268363..a590039b 100644 --- a/test/braintrust/eval/evaluator_test.rb +++ b/test/braintrust/eval/evaluator_test.rb @@ -77,7 +77,10 @@ def test_run_delegates_to_eval_run end def test_run_passes_on_progress - evaluator = Braintrust::Eval::Evaluator.new(task: ->(input:) { input }) + evaluator = Braintrust::Eval::Evaluator.new( + task: ->(input:) { input }, + scorers: [Braintrust::Scorer.new("noop") { 1.0 }] + ) progress_events = [] cases = [{input: "a"}, {input: "b"}] diff --git a/test/braintrust/eval/runner_test.rb b/test/braintrust/eval/runner_test.rb index bd4b525a..28eb95a4 100644 --- a/test/braintrust/eval/runner_test.rb +++ b/test/braintrust/eval/runner_test.rb @@ -2065,3 +2065,272 @@ def test_runner_parameters_with_parallelism assert(params.all? { |p| p == {"model" => "gpt-4"} }) end end + +class Braintrust::Eval::RunnerClassifierTest < Minitest::Test + def test_runner_with_classifiers_only + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("category") { |output:| {name: "category", id: "greeting", label: "Greeting"} } + ], + cases: [{input: "hello", expected: nil}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + assert_equal({}, result.scores) + assert_equal({"category" => [{id: "greeting", label: "Greeting"}]}, result.classifications) + end + + def test_runner_with_scorers_and_classifiers + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [Braintrust::Scorer.new("exact") { |expected:, output:| (output == expected) ? 1.0 : 0.0 }], + classifiers: [ + Braintrust::Classifier.new("category") { |**| {name: "category", id: "text"} } + ], + cases: [{input: "hello", expected: "HELLO"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? 
+ assert_equal [1.0], result.scores["exact"] + assert_equal({"category" => [{id: "text"}]}, result.classifications) + end + + def test_runner_classifier_nil_return_produces_no_classifications + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("maybe") { |**| nil } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + assert_nil result.classifications + end + + def test_runner_classifier_error_does_not_abort_eval + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [Braintrust::Scorer.new("always_one") { 1.0 }], + classifiers: [ + Braintrust::Classifier.new("broken") { |**| raise "classifier boom" } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + # Eval succeeds — classifier errors don't add to errors queue + assert result.success? + assert_equal [1.0], result.scores["always_one"] + assert_nil result.classifications + end + + def test_runner_classifier_error_does_not_affect_other_classifiers + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("broken") { |**| raise "boom" }, + Braintrust::Classifier.new("working") { |**| {name: "working", id: "ok"} } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? 
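+    # Only the failing classifier is dropped; the working one still reports: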
+ assert_equal({"working" => [{id: "ok"}]}, result.classifications) + end + + def test_runner_classifier_error_logged_to_span_metadata + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("broken") { |**| raise "classifier boom" } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + Braintrust::Eval::Runner.new(context).run + spans = rig.drain + + eval_span = spans.find { |s| s.name == "eval" } + refute_nil eval_span + metadata = JSON.parse(eval_span.attributes["braintrust.metadata"] || "{}") + assert_equal "classifier boom", metadata.dig("classifier_errors", "broken") + end + + def test_runner_classifier_span_attributes + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("my_classifier") { |**| {name: "my_classifier", id: "foo"} } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + Braintrust::Eval::Runner.new(context).run + spans = rig.drain + + classifier_span = spans.find { |s| s.name == "my_classifier" } + refute_nil classifier_span + span_attrs = JSON.parse(classifier_span.attributes["braintrust.span_attributes"]) + assert_equal "classifier", span_attrs["type"] + assert_equal "scorer", span_attrs["purpose"] + assert_equal "my_classifier", span_attrs["name"] + end + + def test_runner_classifier_multi_label_result + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("sentiment") do |**| + [ + {name: "sentiment", id: "positive", label: "Positive"}, + {name: "sentiment", id: "enthusiastic", label: "Enthusiastic"} + ] + end + ], + cases: [{input: "great!"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + sentiment = result.classifications["sentiment"] + assert_equal 2, sentiment.length + assert_equal({id: "positive", label: "Positive"}, sentiment[0]) + assert_equal({id: "enthusiastic", label: "Enthusiastic"}, sentiment[1]) + end + + def test_runner_classifier_name_defaults_to_function_name + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("my_classifier") { |**| {id: "foo"} } # no :name in result + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? 
+ assert result.classifications.key?("my_classifier") + end + + def test_runner_classifications_logged_to_eval_span + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("category") { |**| {name: "category", id: "greeting"} } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + Braintrust::Eval::Runner.new(context).run + spans = rig.drain + + eval_span = spans.find { |s| s.name == "eval" } + refute_nil eval_span + raw = eval_span.attributes["braintrust.classifications"] + refute_nil raw + classifications = JSON.parse(raw) + assert_equal [{"id" => "greeting"}], classifications["category"] + end + + def test_runner_classifications_nil_when_no_classifiers + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [Braintrust::Scorer.new("exact") { 1.0 }], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + assert_nil result.classifications + end + + def test_runner_multiple_cases_accumulate_classifications + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("category") { |input:| {name: "category", id: (input.length > 3) ? "long" : "short"} } + ], + cases: [{input: "hi"}, {input: "hello"}, {input: "ok"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + category = result.classifications["category"] + assert_equal 3, category.length + end +end diff --git a/test/braintrust/eval_test.rb b/test/braintrust/eval_test.rb index bef2a2cf..6a11d008 100644 --- a/test/braintrust/eval_test.rb +++ b/test/braintrust/eval_test.rb @@ -1384,4 +1384,52 @@ def test_eval_run_with_parameters end end end + + # ============================================ + # Classifier validation tests + # ============================================ + + def test_eval_run_requires_at_least_scorers_or_classifiers + error = assert_raises(ArgumentError) do + Braintrust::Eval.run( + cases: [{input: "hello"}], + task: ->(input:) { input } + ) + end + assert_match(/at least one of scorers or classifiers is required/i, error.message) + end + + def test_eval_run_requires_at_least_scorers_or_classifiers_when_empty_arrays + error = assert_raises(ArgumentError) do + Braintrust::Eval.run( + cases: [{input: "hello"}], + task: ->(input:) { input }, + scorers: [], + classifiers: [] + ) + end + assert_match(/at least one of scorers or classifiers is required/i, error.message) + end + + def test_eval_run_with_classifiers_only_no_scorers + rig = setup_otel_test_rig + + result = run_test_eval( + experiment_id: "exp-123", + experiment_name: "classifier-only", + project_id: "proj-456", + project_name: "test-project", + cases: [{input: "hello"}], + task: ->(input:) { input }, + classifiers: [ + Braintrust::Classifier.new("category") { |**| {name: "category", id: "greeting"} } + ], + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + assert result.success? 
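+    # No scorers were configured, so scores stays empty while classifications populate: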
+ assert_equal({}, result.scores) + assert_equal({"category" => [{id: "greeting"}]}, result.classifications) + end end diff --git a/test/support/braintrust_helper.rb b/test/support/braintrust_helper.rb index 49d57eb1..8f355ee1 100644 --- a/test/support/braintrust_helper.rb +++ b/test/support/braintrust_helper.rb @@ -75,11 +75,13 @@ def get_integration_test_api(**options) # Helper to run eval internally without API calls for testing # @param state [State] Braintrust state - def run_test_eval(experiment_id:, experiment_name:, project_id:, project_name:, - cases:, task:, scorers:, state:, parallelism: 1, tracer_provider: nil) + def run_test_eval(experiment_id:, experiment_name:, project_id:, + project_name:, cases:, task:, state:, scorers: [], classifiers: [], + parallelism: 1, tracer_provider: nil) context = Braintrust::Eval::Context.build( task: task, scorers: scorers, + classifiers: classifiers, cases: cases, experiment_id: experiment_id, experiment_name: experiment_name, From 4cd8c1047713c71a35fa282b9652f5f271fbd67b Mon Sep 17 00:00:00 2001 From: Stephen Belanger Date: Wed, 22 Apr 2026 12:52:19 -0700 Subject: [PATCH 2/5] Add classifier example --- examples/eval/classifiers.rb | 144 +++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 examples/eval/classifiers.rb diff --git a/examples/eval/classifiers.rb b/examples/eval/classifiers.rb new file mode 100644 index 00000000..2d93d320 --- /dev/null +++ b/examples/eval/classifiers.rb @@ -0,0 +1,144 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require "bundler/setup" +require "braintrust" +require "opentelemetry/sdk" + +# Example: Classifiers +# +# Classifiers categorize and label eval outputs. Unlike scorers (which return +# numeric 0-1 values), classifiers return structured Classification items — +# each with an :id, an optional :label, and optional :metadata. +# +# Results are stored as a dictionary keyed by classifier name: +# +# { "sentiment" => [{ id: "positive", label: "Positive" }] } +# +# Three patterns are shown: +# +# 1. Block-based (Braintrust::Classifier.new): +# Returns a single Classification hash. Good for concise, one-off classifiers. +# +# 2. Multi-label block-based: +# Returns an Array of Classification hashes — useful when a single +# classifier assigns multiple labels to the same output. +# +# 3. Class-based (include Braintrust::Classifier): +# Define a class with a #call method. Good for reusable classifiers +# that carry their own logic and state. +# +# Classifiers and scorers run independently. You can use both together, or +# use only classifiers when you don't need numeric scores. +# +# Usage: +# bundle exec ruby examples/eval/classifiers.rb + +Braintrust.init + +# --------------------------------------------------------------------------- +# Test cases: customer support messages +# --------------------------------------------------------------------------- +MESSAGES = [ + {input: "Hi! I just wanted to say thank you, the product is amazing!"}, + {input: "I've been waiting 2 weeks for my order. This is unacceptable!"}, + {input: "How do I reset my password? I can't find the option anywhere."}, + {input: "The item arrived damaged. 
I need a refund immediately."}, + {input: "Just checking in — any update on my ticket #4821?"} +] + +# --------------------------------------------------------------------------- +# Simulated task: generate a support response (replace with a real LLM call) +# --------------------------------------------------------------------------- +def generate_response(message) + case message + when /thank/i then "You're welcome! So glad you're enjoying it." + when /waiting|order/i then "I sincerely apologise for the delay. Let me look into this right away." + when /password|reset/i then "To reset your password, go to Settings > Account > Reset Password." + when /damaged|refund/i then "I'm sorry to hear that. I'll process your refund immediately." + else "Thanks for reaching out! Let me check on that for you." + end +end + +# --------------------------------------------------------------------------- +# Pattern 1: block-based single-label classifier +# +# Classifies each message into a single intent category. +# Declare only the kwargs you need — extras are filtered automatically. +# --------------------------------------------------------------------------- +intent_classifier = Braintrust::Classifier.new("intent") do |input:| + id = case input + when /thank/i then "praise" + when /waiting|order|update/i then "follow_up" + when /password|reset|find/i then "how_to" + when /damaged|refund/i then "complaint" + else "other" + end + + {name: "intent", id: id, label: id.tr("_", " ").capitalize} +end + +# --------------------------------------------------------------------------- +# Pattern 2: block-based multi-label classifier +# +# A single classifier can return an Array to assign multiple labels. +# All items sharing the same :name are grouped into the same results array. +# --------------------------------------------------------------------------- +tone_classifier = Braintrust::Classifier.new("tone") do |input:| + labels = [] + labels << {name: "tone", id: "urgent", label: "Urgent"} if input.match?(/immediately|unacceptable|waiting/i) + labels << {name: "tone", id: "polite", label: "Polite"} if input.match?(/please|thank|just checking/i) + labels << {name: "tone", id: "frustrated", label: "Frustrated"} if input.match?(/unacceptable|damaged|waiting/i) + labels << {name: "tone", id: "neutral", label: "Neutral"} if labels.empty? + labels +end + +# --------------------------------------------------------------------------- +# Pattern 3: class-based classifier +# +# Include Braintrust::Classifier and define #call with keyword args. +# The class name is snake_cased to derive the default classifier name +# (ResponseQualityClassifier -> "response_quality_classifier"). +# Override #name to customise it. +# --------------------------------------------------------------------------- +class ResponseQualityClassifier + include Braintrust::Classifier + + def name + "response_quality" + end + + def call(input:, output:) + word_count = output.to_s.split.length + + id = if output.to_s.strip.empty? 
+ "no_response" + elsif word_count < 5 + "too_short" + elsif output.match?(/immediately|right away|look into/i) + "action_oriented" + else + "informational" + end + + { + name: "response_quality", + id: id, + label: id.tr("_", " ").capitalize, + metadata: {word_count: word_count} + } + end +end + +# --------------------------------------------------------------------------- +# Run the eval — classifiers only (no numeric scores needed here) +# --------------------------------------------------------------------------- +Braintrust::Eval.run( + project: "ruby-sdk-examples", + experiment: "classifiers-example", + cases: MESSAGES, + task: ->(input:) { generate_response(input) }, + classifiers: [intent_classifier, tone_classifier, ResponseQualityClassifier.new] +) + +OpenTelemetry.tracer_provider.shutdown From d80d83c206317e47c4f05c0bf59eac6b93acdd7d Mon Sep 17 00:00:00 2001 From: Stephen Belanger Date: Wed, 22 Apr 2026 15:12:06 -0700 Subject: [PATCH 3/5] Fix server test evaluators to satisfy scorers/classifiers validation Co-Authored-By: Claude Sonnet 4.6 --- test/braintrust/contrib/rails/server/eval_controller_test.rb | 1 + test/braintrust/server/handlers/eval_test.rb | 1 + test/braintrust/server/rack/eval_endpoint_test.rb | 1 + test/braintrust/server/services/eval_service_test.rb | 1 + 4 files changed, 4 insertions(+) diff --git a/test/braintrust/contrib/rails/server/eval_controller_test.rb b/test/braintrust/contrib/rails/server/eval_controller_test.rb index 8eaaa54e..c7f2245c 100644 --- a/test/braintrust/contrib/rails/server/eval_controller_test.rb +++ b/test/braintrust/contrib/rails/server/eval_controller_test.rb @@ -159,6 +159,7 @@ def test_returns_401_when_auth_fails private def test_evaluator(**kwargs) + kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }] Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs) end diff --git a/test/braintrust/server/handlers/eval_test.rb b/test/braintrust/server/handlers/eval_test.rb index dce8a868..b93360dd 100644 --- a/test/braintrust/server/handlers/eval_test.rb +++ b/test/braintrust/server/handlers/eval_test.rb @@ -491,6 +491,7 @@ def test_handler_passes_parent_through private def test_evaluator(**kwargs) + kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }] Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs) end diff --git a/test/braintrust/server/rack/eval_endpoint_test.rb b/test/braintrust/server/rack/eval_endpoint_test.rb index a443e066..22d90654 100644 --- a/test/braintrust/server/rack/eval_endpoint_test.rb +++ b/test/braintrust/server/rack/eval_endpoint_test.rb @@ -197,6 +197,7 @@ def test_rejects_get private def test_evaluator(**kwargs) + kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }] Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs) end diff --git a/test/braintrust/server/services/eval_service_test.rb b/test/braintrust/server/services/eval_service_test.rb index 5320d5f7..b327103d 100644 --- a/test/braintrust/server/services/eval_service_test.rb +++ b/test/braintrust/server/services/eval_service_test.rb @@ -327,6 +327,7 @@ def test_build_state_evicts_oldest_when_cache_full private def test_evaluator(**kwargs) + kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }] Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs) end From dcca411d75ff912471651bba114d929278ceb884 Mon Sep 17 00:00:00 2001 From: Stephen Belanger Date: Wed, 22 Apr 2026 
22:30:53 -0700
Subject: [PATCH 4/5] Fix server tests to explicitly declare scorers rather
 than defaulting via helper

Reverts the blanket `kwargs[:scorers] ||= [...]` added in d80d83c. Each test
that actually runs an eval now explicitly passes `scorers: [noop_scorer]`
where needed. Tests that only exercise validation paths (400/404) are
unaffected.

Co-Authored-By: Claude Sonnet 4.6
---
 .../rails/server/eval_controller_test.rb      | 11 ++++---
 test/braintrust/server/handlers/eval_test.rb  | 29 ++++++++++---------
 .../server/rack/eval_endpoint_test.rb         | 15 ++++++----
 .../server/services/eval_service_test.rb      |  9 ++++--
 4 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/test/braintrust/contrib/rails/server/eval_controller_test.rb b/test/braintrust/contrib/rails/server/eval_controller_test.rb
index c7f2245c..93bd8ad0 100644
--- a/test/braintrust/contrib/rails/server/eval_controller_test.rb
+++ b/test/braintrust/contrib/rails/server/eval_controller_test.rb
@@ -24,7 +24,7 @@ def app
   end

   def test_streams_sse_events_for_inline_data
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }, scorers: [noop_scorer])
     reset_engine!(evaluators: @evaluators, auth: :none)

     post_json "/eval", {
@@ -53,7 +53,7 @@ def test_streams_sse_events_for_inline_data
   end

   def test_progress_events_contain_output
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }, scorers: [noop_scorer])
     reset_engine!(evaluators: @evaluators, auth: :none)

     post_json "/eval", {
@@ -94,7 +94,7 @@ def test_summary_event_contains_scores
   end

   def test_error_still_emits_progress_and_done
-    @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "task exploded" })
+    @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "task exploded" }, scorers: [noop_scorer])
     reset_engine!(evaluators: @evaluators, auth: :none)

     post_json "/eval", {
@@ -159,10 +159,13 @@ def test_returns_401_when_auth_fails
   private

   def test_evaluator(**kwargs)
-    kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }]
     Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs)
   end

+  def noop_scorer
+    Braintrust::Scorer.new("noop") { 1.0 }
+  end
+
   def post_json(path, body)
     post path, JSON.generate(body), {"CONTENT_TYPE" => "application/json"}
   end
diff --git a/test/braintrust/server/handlers/eval_test.rb b/test/braintrust/server/handlers/eval_test.rb
index b93360dd..884d5199 100644
--- a/test/braintrust/server/handlers/eval_test.rb
+++ b/test/braintrust/server/handlers/eval_test.rb
@@ -59,7 +59,7 @@ def test_returns_400_for_multiple_data_sources
   # --- SSE streaming ---

   def test_returns_200_with_sse_content_type
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     status, headers, _ = handler.call(rack_json_env(
       {name: "test-eval", data: {data: [{input: "hello"}]}, experiment_name: "exp"},
@@ -73,7 +73,7 @@
   def test_streams_progress_event_per_case
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "upcase-eval", data: {data: [{input: "a"}, {input: "b"}, {input: "c"}]}, experiment_name: "exp"},
@@ -88,7 +88,7 @@
   end

   def test_progress_event_contains_protocol_fields
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "upcase-eval", data: {data: [{input: "hello"}]}, experiment_name: "exp"},
@@ -108,7 +108,7 @@
   end

   def test_progress_event_contains_task_output_as_json_string
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "upcase-eval", data: {data: [{input: "hello"}]}, experiment_name: "exp"},
@@ -165,7 +165,7 @@ def test_summary_event_contains_scores_and_experiment_name
   end

   def test_stream_ends_with_done
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "test-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"},
@@ -177,7 +177,7 @@
   end

   def test_task_error_still_emits_progress_and_done
-    @evaluators["failing-eval"] = test_evaluator(task: -> { raise "boom" })
+    @evaluators["failing-eval"] = test_evaluator(task: -> { raise "boom" }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "failing-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"},
@@ -191,7 +191,7 @@
   end

   def test_task_error_progress_contains_error_event
-    @evaluators["failing-eval"] = test_evaluator(task: -> { raise "task exploded" })
+    @evaluators["failing-eval"] = test_evaluator(task: -> { raise "task exploded" }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "failing-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"},
@@ -239,7 +239,7 @@ def test_accepts_dataset_id_as_sole_data_source
   # --- Auth passthrough ---

   def test_build_state_returns_nil_without_auth
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     env = rack_json_env(
       {name: "test-eval", data: {data: [{input: "hello"}]}},
@@ -252,7 +252,7 @@
   end

   def test_build_state_returns_nil_for_non_hash_auth
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     env = rack_json_env(
       {name: "test-eval", data: {data: [{input: "hello"}]}},
@@ -442,7 +442,7 @@ def test_handler_resolves_scores_to_scorer_ids
   # --- Server-specific body selection ---

   def test_returns_sse_body_without_protocol_http_request
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {name: "test-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"},
@@ -453,7 +453,7 @@
   end

   def test_returns_sse_stream_body_with_protocol_http_request
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     env = rack_json_env(
       {name: "test-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"},
@@ -470,7 +470,7 @@ def test_returns_sse_stream_body_with_protocol_http_request
   # --- Parent passthrough ---

   def test_handler_passes_parent_through
-    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input })
+    @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer])

     _, _, body = handler.call(rack_json_env(
       {
@@ -491,10 +491,13 @@ def test_handler_passes_parent_through
   private

   def test_evaluator(**kwargs)
-    kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }]
     Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs)
   end

+  def noop_scorer
+    Braintrust::Scorer.new("noop") { 1.0 }
+  end
+
   def handler
     Braintrust::Server::Handlers::Eval.new(@evaluators)
   end
diff --git a/test/braintrust/server/rack/eval_endpoint_test.rb b/test/braintrust/server/rack/eval_endpoint_test.rb
index 22d90654..8e943ffe 100644
--- a/test/braintrust/server/rack/eval_endpoint_test.rb
+++ b/test/braintrust/server/rack/eval_endpoint_test.rb
@@ -21,7 +21,7 @@ def app
   end

   def test_streams_sse_events_for_inline_data
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer])

     post_json "/eval", {
       name: "upcase-eval",
@@ -52,7 +52,7 @@ def test_streams_sse_events_for_inline_data
   end

   def test_progress_events_contain_output
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer])

     post_json "/eval", {
       name: "upcase-eval",
@@ -94,7 +94,8 @@ def test_summary_event_contains_scores
   def test_error_still_emits_progress_and_done
     @evaluators["failing-eval"] = test_evaluator(
-      task: -> { raise "task exploded" }
+      task: -> { raise "task exploded" },
+      scorers: [noop_scorer]
     )

     post_json "/eval", {
@@ -167,7 +168,8 @@ def test_parameters_forwarded_to_task
       task: ->(input:, parameters:) {
         prefix = parameters["greeting"] || "hey"
         "#{prefix} #{input}"
-      }
+      },
+      scorers: [noop_scorer]
     )

     post_json "/eval", {
@@ -197,10 +199,13 @@ def test_rejects_get
   private

   def test_evaluator(**kwargs)
-    kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }]
     Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs)
   end

+  def noop_scorer
+    Braintrust::Scorer.new("noop") { 1.0 }
+  end
+
   def post_json(path, body)
     post path, JSON.generate(body), {"CONTENT_TYPE" => "application/json"}
   end
diff --git a/test/braintrust/server/services/eval_service_test.rb b/test/braintrust/server/services/eval_service_test.rb
index b327103d..4f5a32d1 100644
--- a/test/braintrust/server/services/eval_service_test.rb
+++ b/test/braintrust/server/services/eval_service_test.rb
@@ -92,7 +92,7 @@ def test_validate_accepts_dataset_name
   # --- stream ---

   def test_stream_emits_progress_and_done_events
-    @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase })
+    @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }, scorers: [noop_scorer])
     s = service
     validated = s.validate({
       "name" => "upcase-eval",
@@ -129,7 +129,7 @@ def test_stream_emits_summary_with_scores
   end

   def test_stream_emits_error_progress_on_task_failure
-    @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "boom" })
+    @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "boom" }, scorers: [noop_scorer])
     s = service
     validated = s.validate({
       "name" => "failing-eval",
@@ -327,10 +327,13 @@ def test_build_state_evicts_oldest_when_cache_full
   private

   def test_evaluator(**kwargs)
-    kwargs[:scorers] ||= [Braintrust::Scorer.new("noop") { 1.0 }]
     Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs)
   end

+  def noop_scorer
+    Braintrust::Scorer.new("noop") { 1.0 }
+  end
+
   def collect_streamed_events(svc, validated, auth: nil)
     chunks = []
     sse = Braintrust::Server::SSEWriter.new { |chunk| chunks << chunk }

From 541e6e6cb13421267961d7ef867b383e3d57db46 Mon Sep 17 00:00:00 2001
From: Stephen Belanger
Date: Tue, 5 May 2026 15:26:56 -0700
Subject: [PATCH 5/5] Surface classifier errors via result.errors

Mirror the scorer error pattern by pushing classifier failures onto the
errors queue, so they appear in result.errors (and result.success? is
false) in addition to being recorded in span metadata.

Addresses PR #154 review feedback.
---
 lib/braintrust/eval/runner.rb       |  3 +++
 test/braintrust/eval/runner_test.rb | 10 +++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/lib/braintrust/eval/runner.rb b/lib/braintrust/eval/runner.rb
index 0cd249d8..074415ff 100644
--- a/lib/braintrust/eval/runner.rb
+++ b/lib/braintrust/eval/runner.rb
@@ -128,6 +128,9 @@ def run_eval_case(kase, errors)
       existing_metadata = kase.metadata || {}
       classifier_errors_metadata = existing_metadata.merge(classifier_errors: classifier_errors)
       set_json_attr(eval_span, "braintrust.metadata", classifier_errors_metadata)
+      classifier_errors.each do |classifier_name, message|
+        errors << "Classifier '#{classifier_name}' failed for input '#{kase.input}': #{message}"
+      end
     end

     # Set output after task completes
diff --git a/test/braintrust/eval/runner_test.rb b/test/braintrust/eval/runner_test.rb
index 28eb95a4..7b021236 100644
--- a/test/braintrust/eval/runner_test.rb
+++ b/test/braintrust/eval/runner_test.rb
@@ -2145,8 +2145,10 @@ def test_runner_classifier_error_does_not_abort_eval
     )

     result = Braintrust::Eval::Runner.new(context).run
-    # Eval succeeds — classifier errors don't add to errors queue
-    assert result.success?
+    # Eval continues running task and scorers, but classifier errors are surfaced.
+    refute result.success?
+    assert_equal 1, result.errors.length
+    assert_match(/Classifier 'broken' failed for input 'hello': classifier boom/, result.errors.first)
     assert_equal [1.0], result.scores["always_one"]
     assert_nil result.classifications
   end
@@ -2168,7 +2170,9 @@ def test_runner_classifier_error_does_not_affect_other_classifiers
     )

     result = Braintrust::Eval::Runner.new(context).run
-    assert result.success?
+    refute result.success?
+    assert_equal 1, result.errors.length
+    assert_match(/Classifier 'broken' failed for input 'hello': boom/, result.errors.first)
     assert_equal({"working" => [{id: "ok"}]}, result.classifications)
   end
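
A closing note on the test-side pattern from PATCH 4/5: with the helper
default removed, every test that actually executes an eval wires its scorer
explicitly, as shown below. This is a sketch assembled from the hunks above;
`test_evaluator` and `noop_scorer` are the private helpers each test class
defines.

    # Explicit wiring: test_evaluator no longer injects a default scorer.
    @evaluators["upcase-eval"] = test_evaluator(
      task: ->(input:) { input.to_s.upcase },
      scorers: [noop_scorer]  # previously filled in by kwargs[:scorers] ||= [...]
    )

Validation-only tests (the 400/404 paths) never reach the runner, so they can
keep constructing evaluators without scorers.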
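
Taken together, the contract PATCH 5/5 lands: a classifier failure is
recorded in span metadata, pushed onto the errors queue, and does not abort
the task or scorers. A minimal sketch of the resulting observable behavior,
adapted from the runner tests above. `context` is an assumption here: it
stands in for the eval context that runner_test.rb builds with one raising
classifier and one passing scorer, and its construction is elided.

    # Assumed setup, mirroring test_runner_classifier_error_does_not_abort_eval;
    # registration of these into `context` is elided (see runner_test.rb).
    broken = Braintrust::Classifier.new("broken") { |output:| raise "classifier boom" }
    always_one = Braintrust::Scorer.new("always_one") { 1.0 }

    result = Braintrust::Eval::Runner.new(context).run

    result.success?         # => false: the classifier failure lands in result.errors
    result.errors.first     # => "Classifier 'broken' failed for input 'hello': classifier boom"
    result.scores           # => {"always_one" => [1.0]}: task and scorers still ran
    result.classifications  # => nil: the broken classifier produced nothing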