diff --git a/examples/eval/classifiers.rb b/examples/eval/classifiers.rb
new file mode 100644
index 00000000..2d93d320
--- /dev/null
+++ b/examples/eval/classifiers.rb
@@ -0,0 +1,144 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require "bundler/setup"
+require "braintrust"
+require "opentelemetry/sdk"
+
+# Example: Classifiers
+#
+# Classifiers categorize and label eval outputs. Unlike scorers (which return
+# numeric 0-1 values), classifiers return structured Classification items —
+# each with an :id, an optional :label, and optional :metadata.
+#
+# Results are stored as a Hash keyed by classifier name:
+#
+#   { "sentiment" => [{ id: "positive", label: "Positive" }] }
+#
+# Three patterns are shown:
+#
+# 1. Block-based (Braintrust::Classifier.new):
+#    Returns a single Classification hash. Good for concise, one-off classifiers.
+#
+# 2. Multi-label block-based:
+#    Returns an Array of Classification hashes — useful when a single
+#    classifier assigns multiple labels to the same output.
+#
+# 3. Class-based (include Braintrust::Classifier):
+#    Define a class with a #call method. Good for reusable classifiers
+#    that carry their own logic and state.
+#
+# Classifiers and scorers run independently. You can use both together, or
+# use only classifiers when you don't need numeric scores.
+#
+# Usage:
+#   bundle exec ruby examples/eval/classifiers.rb
+
+Braintrust.init
+
+# ---------------------------------------------------------------------------
+# Test cases: customer support messages
+# ---------------------------------------------------------------------------
+MESSAGES = [
+  {input: "Hi! I just wanted to say thank you, the product is amazing!"},
+  {input: "I've been waiting 2 weeks for my order. This is unacceptable!"},
+  {input: "How do I reset my password? I can't find the option anywhere."},
+  {input: "The item arrived damaged. I need a refund immediately."},
+  {input: "Just checking in — any update on my ticket #4821?"}
+]
+
+# ---------------------------------------------------------------------------
+# Simulated task: generate a support response (replace with a real LLM call)
+# ---------------------------------------------------------------------------
+def generate_response(message)
+  case message
+  when /thank/i then "You're welcome! So glad you're enjoying it."
+  when /waiting|order/i then "I sincerely apologize for the delay. Let me look into this right away."
+  when /password|reset/i then "To reset your password, go to Settings > Account > Reset Password."
+  when /damaged|refund/i then "I'm sorry to hear that. I'll process your refund immediately."
+  else "Thanks for reaching out! Let me check on that for you."
+  end
+end
+
+# ---------------------------------------------------------------------------
+# Pattern 1: block-based single-label classifier
+#
+# Classifies each message into a single intent category.
+# Declare only the kwargs you need — extras are filtered automatically.
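+# For instance, the block below declares only `input:`; at runtime the runner
+# also passes expected:, output:, metadata:, trace:, and parameters:, and those
+# are dropped before the block is invoked.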
+# ---------------------------------------------------------------------------
+intent_classifier = Braintrust::Classifier.new("intent") do |input:|
+  id = case input
+  when /thank/i then "praise"
+  when /waiting|order|update/i then "follow_up"
+  when /password|reset|find/i then "how_to"
+  when /damaged|refund/i then "complaint"
+  else "other"
+  end
+
+  {name: "intent", id: id, label: id.tr("_", " ").capitalize}
+end
+
+# ---------------------------------------------------------------------------
+# Pattern 2: block-based multi-label classifier
+#
+# A single classifier can return an Array to assign multiple labels.
+# All items sharing the same :name are grouped into the same results array.
+# ---------------------------------------------------------------------------
+tone_classifier = Braintrust::Classifier.new("tone") do |input:|
+  labels = []
+  labels << {name: "tone", id: "urgent", label: "Urgent"} if input.match?(/immediately|unacceptable|waiting/i)
+  labels << {name: "tone", id: "polite", label: "Polite"} if input.match?(/please|thank|just checking/i)
+  labels << {name: "tone", id: "frustrated", label: "Frustrated"} if input.match?(/unacceptable|damaged|waiting/i)
+  labels << {name: "tone", id: "neutral", label: "Neutral"} if labels.empty?
+  labels
+end
+
+# ---------------------------------------------------------------------------
+# Pattern 3: class-based classifier
+#
+# Include Braintrust::Classifier and define #call with keyword args.
+# The class name is snake_cased to derive the default classifier name
+# (ResponseQualityClassifier -> "response_quality_classifier").
+# Override #name to customize it.
+# ---------------------------------------------------------------------------
+class ResponseQualityClassifier
+  include Braintrust::Classifier
+
+  def name
+    "response_quality"
+  end
+
+  def call(input:, output:)
+    word_count = output.to_s.split.length
+
+    id = if output.to_s.strip.empty?
+      "no_response"
+    elsif word_count < 5
+      "too_short"
+    elsif output.match?(/immediately|right away|look into/i)
+      "action_oriented"
+    else
+      "informational"
+    end
+
+    {
+      name: "response_quality",
+      id: id,
+      label: id.tr("_", " ").capitalize,
+      metadata: {word_count: word_count}
+    }
+  end
+end
+
+# ---------------------------------------------------------------------------
+# Run the eval — classifiers only (no numeric scores needed here)
+# ---------------------------------------------------------------------------
+Braintrust::Eval.run(
+  project: "ruby-sdk-examples",
+  experiment: "classifiers-example",
+  cases: MESSAGES,
+  task: ->(input:) { generate_response(input) },
+  classifiers: [intent_classifier, tone_classifier, ResponseQualityClassifier.new]
+)
+
+OpenTelemetry.tracer_provider.shutdown
diff --git a/lib/braintrust/classifier.rb b/lib/braintrust/classifier.rb
new file mode 100644
index 00000000..089de302
--- /dev/null
+++ b/lib/braintrust/classifier.rb
@@ -0,0 +1,157 @@
+# frozen_string_literal: true
+
+require_relative "internal/callable"
+
+module Braintrust
+  # Classifier wraps a classification function that categorizes and labels eval outputs.
+  #
+  # Unlike scorers (which return numeric 0-1 values), classifiers return structured
+  # {Classification} items with an :id and optional :label and :metadata.
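+  # For illustration, a complete item might look like this (values invented):
+  #   {name: "sentiment", id: "positive", label: "Positive", metadata: {rule: "regex"}}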
+  #
+  # Use inline with a block (keyword args):
+  #   classifier = Classifier.new("category") { |output:| {name: "category", id: "greeting", label: "Greeting"} }
+  #
+  # Or include in a class and define #call with keyword args:
+  #   class CategoryClassifier
+  #     include Braintrust::Classifier
+  #
+  #     def call(output:)
+  #       {name: "category", id: "greeting", label: "Greeting"}
+  #     end
+  #   end
+  #
+  # Classifiers may return a single Classification hash, an Array of them, or nil
+  # (meaning no classifications for this case).
+  module Classifier
+    DEFAULT_NAME = "classifier"
+
+    # @param base [Class] the class including Classifier
+    def self.included(base)
+      base.include(Callable)
+    end
+
+    # Create a block-based classifier.
+    #
+    # @param name [String, nil] optional name (defaults to "classifier")
+    # @param block [Proc] the classification implementation; declare only the keyword
+    #   args you need. Extra kwargs are filtered out automatically.
+    #
+    #   Supported kwargs: +input:+, +expected:+, +output:+, +metadata:+, +trace:+, +parameters:+
+    # @return [Classifier::Block]
+    # @raise [ArgumentError] if the block has unsupported arity
+    def self.new(name = nil, &block)
+      Block.new(name: name || DEFAULT_NAME, &block)
+    end
+
+    # Included into classes that +include Classifier+. Prepends KeywordFilter and
+    # ClassificationNormalizer so #call receives only declared kwargs and always returns
+    # Array<Hash>. Also provides a default #name and #call_parameters.
+    module Callable
+      # Normalizes the raw return value of #call into Array<Hash>.
+      # Nested inside Callable because it depends on #name which Callable provides.
+      module ClassificationNormalizer
+        # @return [Array<Hash>] normalized classification hashes with :name, :id, and optional :label, :metadata keys
+        def call(**kwargs)
+          normalize_classification_result(super)
+        end
+
+        private
+
+        # @param result [Hash, Array<Hash>, nil] raw return value from #call
+        # @return [Array<Hash>] zero or more classification hashes with :name, :id keys
+        # @raise [ArgumentError] if any item is not a non-empty object
+        def normalize_classification_result(result)
+          case result
+          when nil then []
+          when Array then result.map { |item| normalize_classification_item(item) }
+          when Hash then [normalize_classification_item(result)]
+          else
+            raise ArgumentError, "When returning structured classifier results, each classification must be a non-empty object. Got: #{result.inspect}"
+          end
+        end
+
+        # Fills in missing :name from the classifier; validates the item is a non-empty Hash.
+        # @param item [Hash] a classification hash
+        # @return [Hash] the item with :name defaulted and validated
+        # @raise [ArgumentError] if item is not a non-empty Hash
+        def normalize_classification_item(item)
+          unless item.is_a?(Hash) && !item.empty?
+            raise ArgumentError, "When returning structured classifier results, each classification must be a non-empty object. Got: #{item.inspect}"
+          end
+
+          # :name defaults to the classifier's resolved name when missing, empty, or non-string
+          unless item[:name].is_a?(String) && !item[:name].empty?
+            item = item.merge(name: name)
+          end
+
+          item
+        end
+      end
+
+      # Infrastructure modules prepended onto every classifier class.
+      # Used both to set up the ancestor chain and to skip past them in
+      # #call_parameters so KeywordFilter sees the real call signature.
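+      # With both prepended (ClassificationNormalizer in front), a call runs
+      # ClassificationNormalizer#call -> KeywordFilter#call -> the user's #call:
+      # kwargs are filtered on the way in, and the return value is normalized
+      # into Array<Hash> on the way out.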
+ PREPENDED = [Internal::Callable::KeywordFilter, ClassificationNormalizer].freeze + + # @param base [Class] the class including Callable + def self.included(base) + PREPENDED.each { |mod| base.prepend(mod) } + end + + # Default name derived from the class name (e.g. CategoryClassifier -> "category_classifier"). + # @return [String] + def name + klass = self.class.name&.split("::")&.last + return Classifier::DEFAULT_NAME unless klass + klass.gsub(/([a-z])([A-Z])/, '\1_\2').downcase + end + + # Provides KeywordFilter with the actual call signature of the subclass. + # Walks past PREPENDED modules in the ancestor chain so that user-defined + # #call keyword params are correctly introspected. + # Block overrides this to point directly at @block.parameters. + # @return [Array] parameter list + def call_parameters + meth = method(:call) + meth = meth.super_method while meth.super_method && PREPENDED.include?(meth.owner) + meth.parameters + end + end + + # Block-based classifier. Stores a Proc and delegates #call to it. + # Includes Classifier so it satisfies +Classifier ===+ checks. + # Exposes #call_parameters so KeywordFilter can introspect the block's + # declared kwargs rather than Block#call's **kwargs signature. + class Block + include Classifier + + # @return [String] + attr_reader :name + + # @param name [String] classifier name + # @param block [Proc] classification implementation; must use keyword args or zero-arity + # @raise [ArgumentError] if the block uses positional params + def initialize(name: DEFAULT_NAME, &block) + @name = name + params = block.parameters + unless Internal::Callable::KeywordFilter.has_any_keywords?(params) || block.arity == 0 + raise ArgumentError, "Classifier block must use keyword args (got arity #{block.arity})" + end + @block = block + end + + # @param kwargs [Hash] keyword arguments (filtered by KeywordFilter) + # @return [Array] normalized classification results + def call(**kwargs) + @block.call(**kwargs) + end + + # Exposes the block's parameter list so KeywordFilter can filter + # kwargs to match the block's declared keywords. + # @return [Array] parameter list from Proc#parameters + def call_parameters + @block.parameters + end + end + end +end diff --git a/lib/braintrust/eval.rb b/lib/braintrust/eval.rb index d661eedc..ac27b48b 100644 --- a/lib/braintrust/eval.rb +++ b/lib/braintrust/eval.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true +require_relative "classifier" require_relative "scorer" require_relative "task" require_relative "functions" @@ -160,7 +161,10 @@ def scorer(name, callable = nil, &block) # - String: dataset name (fetches from same project) # - Hash: {name:, id:, project:, version:, limit:} # @param task [#call] The task to evaluate (must be callable) - # @param scorers [Array] The scorers to use (String names, Scorer objects, or callables) + # @param scorers [Array, nil] The scorers to use (String names, Scorer objects, or callables). + # At least one of scorers or classifiers must be provided. + # @param classifiers [Array, nil] The classifiers to use. + # At least one of scorers or classifiers must be provided. # @param on_progress [#call, nil] Optional callback fired after each test case. # Receives a Hash: {"data" => output, "scores" => {name => value}} on success, # or {"error" => message} on failure. 
@@ -177,13 +181,16 @@ def scorer(name, callable = nil, &block) # @param parent [Hash, nil] Parent span context ({object_type:, object_id:, generation:}) # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument # @return [Result] - def run(task:, scorers:, project: nil, experiment: nil, - cases: nil, dataset: nil, on_progress: nil, + def run(task:, scorers: nil, classifiers: nil, project: nil, + experiment: nil, cases: nil, dataset: nil, on_progress: nil, parallelism: 1, tags: nil, metadata: nil, update: false, quiet: false, state: nil, tracer_provider: nil, project_id: nil, parent: nil, parameters: nil) # Validate required parameters - validate_params!(task: task, scorers: scorers, cases: cases, dataset: dataset) + validate_params!(task: task, scorers: scorers, + classifiers: classifiers, cases: cases, dataset: dataset) + scorers ||= [] + classifiers ||= [] experiment_id = nil project_name = project @@ -216,6 +223,7 @@ def run(task:, scorers:, project: nil, experiment: nil, context = Context.build( task: task, scorers: scorers, + classifiers: classifiers, cases: cases, experiment_id: experiment_id, experiment_name: experiment, @@ -245,9 +253,19 @@ def print_result(result) # Validate required parameters # @raise [ArgumentError] if validation fails - def validate_params!(task:, scorers:, cases:, dataset:) + def validate_params!(task:, scorers:, classifiers:, cases:, dataset:) raise ArgumentError, "task is required" unless task - raise ArgumentError, "scorers is required" unless scorers + + # Validate task is callable before anything else + unless task.respond_to?(:call) + raise ArgumentError, "task must be callable (respond to :call)" + end + + has_scorers = scorers && !scorers.empty? + has_classifiers = classifiers && !classifiers.empty? + unless has_scorers || has_classifiers + raise ArgumentError, "at least one of scorers or classifiers is required" + end # Validate cases and dataset are mutually exclusive if cases && dataset @@ -258,11 +276,6 @@ def validate_params!(task:, scorers:, cases:, dataset:) unless cases || dataset raise ArgumentError, "must specify either 'cases' or 'dataset'" end - - # Validate task is callable - unless task.respond_to?(:call) - raise ArgumentError, "task must be callable (respond to :call)" - end end # Resolve project by name or ID. Creates if needed. diff --git a/lib/braintrust/eval/context.rb b/lib/braintrust/eval/context.rb index ebcca050..02717267 100644 --- a/lib/braintrust/eval/context.rb +++ b/lib/braintrust/eval/context.rb @@ -1,18 +1,20 @@ # frozen_string_literal: true require_relative "cases" +require_relative "../classifier" module Braintrust module Eval # Holds all normalized, ready-to-execute eval components. # Use Context.build to construct from raw user inputs. 
class Context - attr_reader :task, :scorers, :cases, :experiment_id, :experiment_name, - :project_id, :project_name, :state, :tracer_provider, + attr_reader :task, :scorers, :classifiers, :cases, :experiment_id, + :experiment_name, :project_id, :project_name, :state, :tracer_provider, :on_progress, :parent_span_attr, :generation, :parameters # @param task [Task] Normalized task wrapper # @param scorers [Array] Normalized scorer wrappers + # @param classifiers [Array] Normalized classifier wrappers # @param cases [Cases] Normalized eval cases # @param experiment_id [String, nil] Experiment ID for logging and trace linkage # @param experiment_name [String, nil] Experiment name, included in span attributes @@ -24,11 +26,13 @@ class Context # @param parent_span_attr [String, nil] Formatted parent span identifier ("type:id"), linking spans to a parent context # @param generation [Integer, nil] Generation number from the parent span context, used to link spans in a trace hierarchy # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument - def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, - project_id: nil, project_name: nil, state: nil, tracer_provider: nil, - on_progress: nil, parent_span_attr: nil, generation: nil, parameters: nil) + def initialize(task:, scorers:, cases:, classifiers: [], + experiment_id: nil, experiment_name: nil, project_id: nil, + project_name: nil, state: nil, tracer_provider: nil, on_progress: nil, + parent_span_attr: nil, generation: nil, parameters: nil) @task = task @scorers = scorers + @classifiers = classifiers @cases = cases @experiment_id = experiment_id @experiment_name = experiment_name @@ -46,6 +50,7 @@ def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil # Delegates to Factory for normalization. 
# @param task [Task, Proc, #call] Task to evaluate; wrapped into a {Task} if needed # @param scorers [Array] Scorers; each is normalized into a {Scorer} + # @param classifiers [Array] Classifiers; each is normalized into a {Classifier} # @param cases [Cases, Array, Enumerable] Eval cases; wrapped into {Cases} if needed # @param experiment_id [String, nil] Experiment ID for logging # @param experiment_name [String, nil] Experiment name, included in span attributes @@ -57,14 +62,15 @@ def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil # @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation # @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument # @return [Context] - def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, - project_id: nil, project_name: nil, state: nil, tracer_provider: nil, - on_progress: nil, parent: nil, parameters: nil) + def self.build(task:, scorers:, cases:, classifiers: [], + experiment_id: nil, experiment_name: nil, project_id: nil, + project_name: nil, state: nil, tracer_provider: nil, on_progress: nil, + parent: nil, parameters: nil) Factory.new( state: state, tracer_provider: tracer_provider, project_id: project_id, project_name: project_name ).build( - task: task, scorers: scorers, cases: cases, + task: task, scorers: scorers, classifiers: classifiers, cases: cases, experiment_id: experiment_id, experiment_name: experiment_name, on_progress: on_progress, parent: parent, parameters: parameters ) @@ -86,17 +92,19 @@ def initialize(state: nil, tracer_provider: nil, project_id: nil, project_name: # Normalize raw inputs and construct a {Context}. # @param task [Task, Proc, #call] Raw task # @param scorers [Array] Raw scorers + # @param classifiers [Array] Raw classifiers # @param cases [Cases, Array, Enumerable] Raw eval cases # @param experiment_id [String, nil] # @param experiment_name [String, nil] # @param on_progress [Proc, nil] # @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation # @return [Context] - def build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, - on_progress: nil, parent: nil, parameters: nil) + def build(task:, scorers:, cases:, classifiers: [], experiment_id: nil, + experiment_name: nil, on_progress: nil, parent: nil, parameters: nil) Context.new( task: normalize_task(task), scorers: normalize_scorers(scorers), + classifiers: normalize_classifiers(classifiers), cases: normalize_cases(cases), experiment_id: experiment_id, experiment_name: experiment_name, @@ -188,6 +196,23 @@ def normalize_scorers(raw) end end end + + # @param raw [Array] + # @return [Array] + def normalize_classifiers(raw) + raw.map do |classifier| + case classifier + when Braintrust::Classifier + classifier + when Proc + # Pass Proc/Lambda directly to preserve keyword arg info + Braintrust::Classifier.new(&classifier) + else + name = classifier.respond_to?(:name) ? 
classifier.name : nil + Braintrust::Classifier.new(name, &classifier.method(:call)) + end + end + end end end end diff --git a/lib/braintrust/eval/evaluator.rb b/lib/braintrust/eval/evaluator.rb index a5f135bc..6798f31a 100644 --- a/lib/braintrust/eval/evaluator.rb +++ b/lib/braintrust/eval/evaluator.rb @@ -40,11 +40,12 @@ module Eval # } # ) class Evaluator - attr_accessor :task, :scorers, :parameters + attr_accessor :task, :scorers, :classifiers, :parameters - def initialize(task: nil, scorers: [], parameters: {}) + def initialize(task: nil, scorers: [], classifiers: [], parameters: {}) @task = task @scorers = scorers + @classifiers = classifiers @parameters = parameters end @@ -68,6 +69,7 @@ def validate! # @param project_id [String, nil] Project UUID (skips project creation) # @param dataset [String, Hash, Dataset, Dataset::ID, nil] Dataset to fetch # @param scorers [Array, nil] Additional scorers (merged with evaluator's own) + # @param classifiers [Array, nil] Additional classifiers (merged with evaluator's own) # @param parent [Hash, nil] Parent span context # @param state [State, nil] Braintrust state # @param update [Boolean] If true, allow reusing existing experiment (default: false) @@ -75,16 +77,19 @@ def validate! # @return [Result] def run(cases, on_progress: nil, quiet: false, project: nil, experiment: nil, project_id: nil, - dataset: nil, scorers: nil, parent: nil, + dataset: nil, scorers: nil, classifiers: nil, parent: nil, state: nil, update: false, tracer_provider: nil, parameters: nil) all_scorers = scorers ? self.scorers + scorers : self.scorers + all_classifiers = classifiers ? + self.classifiers + classifiers : + self.classifiers Braintrust::Eval.run( - task: task, scorers: all_scorers, cases: cases, dataset: dataset, - project: project, experiment: experiment, project_id: project_id, - parent: parent, on_progress: on_progress, quiet: quiet, - state: state, update: update, tracer_provider: tracer_provider, - parameters: parameters + task: task, scorers: all_scorers, classifiers: all_classifiers, + cases: cases, dataset: dataset, project: project, + experiment: experiment, project_id: project_id, parent: parent, + on_progress: on_progress, quiet: quiet, state: state, update: update, + tracer_provider: tracer_provider, parameters: parameters ) end end diff --git a/lib/braintrust/eval/result.rb b/lib/braintrust/eval/result.rb index c18af302..7af7132f 100644 --- a/lib/braintrust/eval/result.rb +++ b/lib/braintrust/eval/result.rb @@ -9,7 +9,7 @@ module Eval # Contains experiment metadata, errors, timing information, and raw score data class Result attr_reader :experiment_id, :experiment_name, :project_id, :project_name, - :permalink, :errors, :duration, :scores + :permalink, :errors, :duration, :scores, :classifications # Create a new result # @param experiment_id [String] The experiment ID @@ -20,8 +20,9 @@ class Result # @param errors [Array] List of errors that occurred # @param duration [Float] Duration in seconds # @param scores [Hash, nil] Raw score data { scorer_name => Array } + # @param classifications [Hash, nil] Classification results { name => Array }, nil when no classifiers ran def initialize(experiment_id:, experiment_name:, project_id:, project_name:, - permalink:, errors:, duration:, scores: nil) + permalink:, errors:, duration:, scores: nil, classifications: nil) @experiment_id = experiment_id @experiment_name = experiment_name @project_id = project_id @@ -30,6 +31,7 @@ def initialize(experiment_id:, experiment_name:, project_id:, project_name:, @errors 
= errors @duration = duration @scores = scores + @classifications = classifications end # Check if the evaluation was successful (no errors) diff --git a/lib/braintrust/eval/runner.rb b/lib/braintrust/eval/runner.rb index f461e041..074415ff 100644 --- a/lib/braintrust/eval/runner.rb +++ b/lib/braintrust/eval/runner.rb @@ -27,8 +27,9 @@ def initialize(eval_context) @eval_context = eval_context @tracer = eval_context.tracer_provider.tracer("braintrust-eval") - # Mutex for thread-safe score collection + # Mutexes for thread-safe result collection @score_mutex = Mutex.new + @classification_mutex = Mutex.new end # Run evaluation and return Result @@ -39,6 +40,7 @@ def run(parallelism: 1) eval_cases = eval_context.cases errors = Queue.new @scores = {} # Reset for each run: { scorer_name => Array } + @classifications = {} # Reset for each run: { classifier_name => Array } if parallelism && parallelism > 1 Internal::ThreadPool.each(eval_cases, parallelism: parallelism) do |eval_case| @@ -69,7 +71,8 @@ def run(parallelism: 1) permalink: permalink, errors: error_array, duration: duration, - scores: @scores + scores: @scores, + classifications: @classifications.empty? ? nil : @classifications ) end @@ -119,6 +122,17 @@ def run_eval_case(kase, errors) errors << "Scorers failed for input '#{kase.input}': #{e.message}" end + # Run classifiers (independent of scorers; errors do not abort eval) + classifier_errors = run_classifiers(kase, eval_span) + unless classifier_errors.empty? + existing_metadata = kase.metadata || {} + classifier_errors_metadata = existing_metadata.merge(classifier_errors: classifier_errors) + set_json_attr(eval_span, "braintrust.metadata", classifier_errors_metadata) + classifier_errors.each do |classifier_name, message| + errors << "Classifier '#{classifier_name}' failed for input '#{kase.input}': #{message}" + end + end + # Set output after task completes set_json_attr(eval_span, "braintrust.output_json", {output: kase.output}) @@ -318,6 +332,104 @@ def collect_scores(score_results) score_results.each { |s| (@scores[s[:name]] ||= []) << s[:score] } end end + + # Run all classifiers for a case. Classifier errors are non-fatal and stored in metadata. + # @param kase [CaseContext] The per-case context (output must be populated) + # @param eval_span [OpenTelemetry::Trace::Span] The eval span for this case + # @return [Hash] classifier_errors map (name -> error message), empty if no errors + def run_classifiers(kase, eval_span) + return {} if eval_context.classifiers.empty? 
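+        # classifier_kwargs is the full invocation payload (includes trace:);
+        # classifier_input is the subset logged as each classifier span's input.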
+
+        classifier_kwargs = {
+          input: kase.input,
+          expected: kase.expected,
+          output: kase.output,
+          metadata: kase.metadata || {},
+          trace: kase.trace,
+          parameters: eval_context.parameters || {}
+        }
+        classifier_input = {
+          input: kase.input,
+          expected: kase.expected,
+          output: kase.output,
+          metadata: kase.metadata || {},
+          parameters: eval_context.parameters || {}
+        }
+
+        case_classifications = {}
+        classifier_errors = {}
+
+        eval_context.classifiers.each_with_index do |classifier, index|
+          classifier_name = classifier.name || "classifier_#{index}"
+          begin
+            results = run_classifier(classifier, classifier_kwargs, classifier_input)
+            results.each do |item|
+              item_name = item[:name]
+              classification_item = item.except(:name)
+              (case_classifications[item_name] ||= []) << classification_item
+            end
+            collect_classifications(results)
+          rescue => e
+            Braintrust::Log.warn("[Classifier] #{classifier_name} failed: #{e.message}")
+            classifier_errors[classifier_name] = e.message
+          end
+        end
+
+        unless case_classifications.empty?
+          set_json_attr(eval_span, "braintrust.classifications", case_classifications)
+        end
+
+        classifier_errors
+      end
+
+      # Run a single classifier inside its own span.
+      # @param classifier [Classifier] The classifier to run
+      # @param classifier_kwargs [Hash] Keyword arguments for the classifier
+      # @param classifier_input [Hash] Input to log on the span
+      # @return [Array] Normalized classification results from the classifier
+      def run_classifier(classifier, classifier_kwargs, classifier_input)
+        tracer.in_span(classifier.name) do |classifier_span|
+          classifier_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
+          set_json_attr(classifier_span, "braintrust.span_attributes", build_classifier_span_attributes(classifier.name))
+          set_json_attr(classifier_span, "braintrust.input_json", classifier_input)
+
+          classification_results = classifier.call(**classifier_kwargs)
+
+          # Build an output Hash keyed by name => array of items (for span logging)
+          output_by_name = {}
+          classification_results.each do |item|
+            (output_by_name[item[:name]] ||= []) << item.except(:name)
+          end
+
+          set_json_attr(classifier_span, "braintrust.output_json", output_by_name)
+
+          classification_results
+        rescue => e
+          record_span_error(classifier_span, e, "ClassifierError")
+          raise
+        end
+      end
+
+      # Build span_attributes for a classifier span.
+      # @param classifier_name [String] The classifier name
+      # @return [Hash]
+      def build_classifier_span_attributes(classifier_name)
+        attrs = {type: "classifier", name: classifier_name, purpose: "scorer"}
+        attrs[:generation] = eval_context.generation if eval_context.generation
+        attrs
+      end
+
+      # Collect classification results into the global accumulator (thread-safe).
+      # Converts Classification to ClassificationItem by dropping :name.
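+      # e.g. {name: "tone", id: "urgent"} is accumulated as
+      # @classifications["tone"] << {id: "urgent"}.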
+ # @param classification_results [Array] Classification results from a classifier + def collect_classifications(classification_results) + @classification_mutex.synchronize do + classification_results.each do |item| + item_name = item[:name] + (@classifications[item_name] ||= []) << item.except(:name) + end + end + end end end end diff --git a/lib/braintrust/server/services/list_service.rb b/lib/braintrust/server/services/list_service.rb index 06bd7add..8c29c6d3 100644 --- a/lib/braintrust/server/services/list_service.rb +++ b/lib/braintrust/server/services/list_service.rb @@ -20,6 +20,11 @@ def call {"name" => scorer_name} end entry = {"scores" => scores} + classifiers = (evaluator.classifiers || []).each_with_index.map do |classifier, i| + classifier_name = classifier.respond_to?(:name) ? classifier.name : "classifier_#{i}" + {"name" => classifier_name} + end + entry["classifiers"] = classifiers unless classifiers.empty? params = serialize_parameters(evaluator.parameters) entry["parameters"] = params if params result[name] = entry diff --git a/test/braintrust/classifier_test.rb b/test/braintrust/classifier_test.rb new file mode 100644 index 00000000..8dfe4501 --- /dev/null +++ b/test/braintrust/classifier_test.rb @@ -0,0 +1,226 @@ +# frozen_string_literal: true + +require "test_helper" +require "braintrust/classifier" + +class Braintrust::ClassifierTest < Minitest::Test + # ============================================ + # Classifier.new with block (inline classifiers) + # ============================================ + + def test_classifier_with_kwargs_block + classifier = Braintrust::Classifier.new("category") do |output:, **| + {name: "category", id: "greeting", label: "Greeting"} + end + + assert_equal "category", classifier.name + result = classifier.call(input: "hello", expected: nil, output: "hello") + assert_equal [{name: "category", id: "greeting", label: "Greeting"}], result + end + + def test_classifier_with_subset_kwargs_filters_extra_keys + classifier = Braintrust::Classifier.new("category") do |output:| + {name: "category", id: "word"} + end + + result = classifier.call(input: "x", expected: nil, output: "hello", metadata: {}, tags: ["t"]) + assert_equal [{name: "category", id: "word"}], result + end + + def test_classifier_returns_nil_produces_empty_array + classifier = Braintrust::Classifier.new("maybe") { |**| nil } + assert_equal [], classifier.call(output: "hello") + end + + def test_classifier_returns_array_of_classifications + classifier = Braintrust::Classifier.new("sentiment") do |**| + [ + {name: "sentiment", id: "positive", label: "Positive"}, + {name: "sentiment", id: "enthusiastic", label: "Enthusiastic"} + ] + end + + result = classifier.call(output: "great!") + assert_equal 2, result.length + assert_equal({name: "sentiment", id: "positive", label: "Positive"}, result[0]) + assert_equal({name: "sentiment", id: "enthusiastic", label: "Enthusiastic"}, result[1]) + end + + def test_classifier_with_metadata + classifier = Braintrust::Classifier.new("category") do |**| + {name: "category", id: "greeting", label: "Greeting", metadata: {source: "unit-test"}} + end + + result = classifier.call(output: "hello") + assert_equal [{name: "category", id: "greeting", label: "Greeting", metadata: {source: "unit-test"}}], result + end + + # ============================================ + # Name defaulting + # ============================================ + + def test_name_defaults_to_classifier_function_name_when_missing + classifier = Braintrust::Classifier.new("my_classifier") { 
|**| + {id: "foo"} # no :name key + } + + result = classifier.call(output: "x") + assert_equal "my_classifier", result[0][:name] + end + + def test_name_defaults_to_classifier_function_name_when_empty_string + classifier = Braintrust::Classifier.new("my_classifier") { |**| + {name: "", id: "foo"} + } + + result = classifier.call(output: "x") + assert_equal "my_classifier", result[0][:name] + end + + def test_name_defaults_to_classifier_function_name_when_not_a_string + classifier = Braintrust::Classifier.new("my_classifier") { |**| + {name: 42, id: "foo"} + } + + result = classifier.call(output: "x") + assert_equal "my_classifier", result[0][:name] + end + + def test_explicit_name_in_result_takes_precedence + classifier = Braintrust::Classifier.new("my_classifier") { |**| + {name: "override_name", id: "foo"} + } + + result = classifier.call(output: "x") + assert_equal "override_name", result[0][:name] + end + + # ============================================ + # Validation + # ============================================ + + def test_classifier_non_empty_object_validation_nil_item + classifier = Braintrust::Classifier.new("bad") { |**| [nil] } + + error = assert_raises(ArgumentError) do + classifier.call(output: "x") + end + assert_match(/each classification must be a non-empty object/, error.message) + assert_match(/nil/, error.message) + end + + def test_classifier_non_empty_object_validation_empty_hash + classifier = Braintrust::Classifier.new("bad") { |**| {} } + + error = assert_raises(ArgumentError) do + classifier.call(output: "x") + end + assert_match(/each classification must be a non-empty object/, error.message) + end + + def test_classifier_non_empty_object_validation_string_item + classifier = Braintrust::Classifier.new("bad") { |**| ["not-a-hash"] } + + error = assert_raises(ArgumentError) do + classifier.call(output: "x") + end + assert_match(/each classification must be a non-empty object/, error.message) + end + + def test_classifier_non_empty_object_validation_non_hash_scalar + classifier = Braintrust::Classifier.new("bad") { |**| 42 } + + error = assert_raises(ArgumentError) do + classifier.call(output: "x") + end + assert_match(/each classification must be a non-empty object/, error.message) + end + + def test_classifier_positional_params_raises + error = assert_raises(ArgumentError) do + Braintrust::Classifier.new("bad") { |a, b| a } + end + + assert_match(/classifier block must use keyword args/i, error.message) + end + + # ============================================ + # Name detection + # ============================================ + + def test_classifier_name_defaults_to_classifier_for_base_class + classifier = Braintrust::Classifier.new { |**| {id: "x"} } + assert_equal "classifier", classifier.name + end + + def test_classifier_explicit_name_takes_precedence + classifier = Braintrust::Classifier.new("my_name") { |**| {id: "x"} } + assert_equal "my_name", classifier.name + end + + # ============================================ + # Subclass pattern + # ============================================ + + def test_subclass_with_call_override + klass = Class.new do + include Braintrust::Classifier + + def call(output:) + {name: "category", id: output.empty? ? 
"empty" : "nonempty"} + end + end + + classifier = klass.new + assert_kind_of Braintrust::Classifier, classifier + + result = classifier.call(input: "x", expected: nil, output: "hello") + assert_equal [{name: "category", id: "nonempty"}], result + + result2 = classifier.call(input: "x", expected: nil, output: "") + assert_equal [{name: "category", id: "empty"}], result2 + end + + def test_subclass_with_name_override + klass = Class.new do + include Braintrust::Classifier + + def name + "custom_classifier" + end + + def call(**) + {id: "foo"} + end + end + + classifier = klass.new + assert_equal "custom_classifier", classifier.name + end + + def test_subclass_name_derived_from_class_name + klass = Class.new do + include Braintrust::Classifier + + def call(**) + {id: "foo"} + end + end + + Braintrust.stub_const(:FuzzyMatchTestClassifier, klass) do + classifier = klass.new + assert_equal "fuzzy_match_test_classifier", classifier.name + end + end + + def test_subclass_without_call_raises_on_call + klass = Class.new do + include Braintrust::Classifier + end + classifier = klass.new + + assert_raises(NoMethodError) do + classifier.call(output: "x") + end + end +end diff --git a/test/braintrust/contrib/rails/server/eval_controller_test.rb b/test/braintrust/contrib/rails/server/eval_controller_test.rb index 8eaaa54e..93bd8ad0 100644 --- a/test/braintrust/contrib/rails/server/eval_controller_test.rb +++ b/test/braintrust/contrib/rails/server/eval_controller_test.rb @@ -24,7 +24,7 @@ def app end def test_streams_sse_events_for_inline_data - @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }) + @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }, scorers: [noop_scorer]) reset_engine!(evaluators: @evaluators, auth: :none) post_json "/eval", { @@ -53,7 +53,7 @@ def test_streams_sse_events_for_inline_data end def test_progress_events_contain_output - @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }) + @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }, scorers: [noop_scorer]) reset_engine!(evaluators: @evaluators, auth: :none) post_json "/eval", { @@ -94,7 +94,7 @@ def test_summary_event_contains_scores end def test_error_still_emits_progress_and_done - @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "task exploded" }) + @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "task exploded" }, scorers: [noop_scorer]) reset_engine!(evaluators: @evaluators, auth: :none) post_json "/eval", { @@ -162,6 +162,10 @@ def test_evaluator(**kwargs) Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs) end + def noop_scorer + Braintrust::Scorer.new("noop") { 1.0 } + end + def post_json(path, body) post path, JSON.generate(body), {"CONTENT_TYPE" => "application/json"} end diff --git a/test/braintrust/eval/evaluator_test.rb b/test/braintrust/eval/evaluator_test.rb index e6268363..a590039b 100644 --- a/test/braintrust/eval/evaluator_test.rb +++ b/test/braintrust/eval/evaluator_test.rb @@ -77,7 +77,10 @@ def test_run_delegates_to_eval_run end def test_run_passes_on_progress - evaluator = Braintrust::Eval::Evaluator.new(task: ->(input:) { input }) + evaluator = Braintrust::Eval::Evaluator.new( + task: ->(input:) { input }, + scorers: [Braintrust::Scorer.new("noop") { 1.0 }] + ) progress_events = [] cases = [{input: "a"}, {input: "b"}] diff --git a/test/braintrust/eval/runner_test.rb b/test/braintrust/eval/runner_test.rb 
index bd4b525a..7b021236 100644 --- a/test/braintrust/eval/runner_test.rb +++ b/test/braintrust/eval/runner_test.rb @@ -2065,3 +2065,276 @@ def test_runner_parameters_with_parallelism assert(params.all? { |p| p == {"model" => "gpt-4"} }) end end + +class Braintrust::Eval::RunnerClassifierTest < Minitest::Test + def test_runner_with_classifiers_only + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("category") { |output:| {name: "category", id: "greeting", label: "Greeting"} } + ], + cases: [{input: "hello", expected: nil}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + assert_equal({}, result.scores) + assert_equal({"category" => [{id: "greeting", label: "Greeting"}]}, result.classifications) + end + + def test_runner_with_scorers_and_classifiers + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [Braintrust::Scorer.new("exact") { |expected:, output:| (output == expected) ? 1.0 : 0.0 }], + classifiers: [ + Braintrust::Classifier.new("category") { |**| {name: "category", id: "text"} } + ], + cases: [{input: "hello", expected: "HELLO"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + assert_equal [1.0], result.scores["exact"] + assert_equal({"category" => [{id: "text"}]}, result.classifications) + end + + def test_runner_classifier_nil_return_produces_no_classifications + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("maybe") { |**| nil } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + assert_nil result.classifications + end + + def test_runner_classifier_error_does_not_abort_eval + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [Braintrust::Scorer.new("always_one") { 1.0 }], + classifiers: [ + Braintrust::Classifier.new("broken") { |**| raise "classifier boom" } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + # Eval continues running task and scorers, but classifier errors are surfaced. + refute result.success? + assert_equal 1, result.errors.length + assert_match(/Classifier 'broken' failed for input 'hello': classifier boom/, result.errors.first) + assert_equal [1.0], result.scores["always_one"] + assert_nil result.classifications + end + + def test_runner_classifier_error_does_not_affect_other_classifiers + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("broken") { |**| raise "boom" }, + Braintrust::Classifier.new("working") { |**| {name: "working", id: "ok"} } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + refute result.success? 
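+    # One failing classifier marks the run unsuccessful, but the assertions
+    # below show the working classifier still produced results.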
+ assert_equal 1, result.errors.length + assert_match(/Classifier 'broken' failed for input 'hello': boom/, result.errors.first) + assert_equal({"working" => [{id: "ok"}]}, result.classifications) + end + + def test_runner_classifier_error_logged_to_span_metadata + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("broken") { |**| raise "classifier boom" } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + Braintrust::Eval::Runner.new(context).run + spans = rig.drain + + eval_span = spans.find { |s| s.name == "eval" } + refute_nil eval_span + metadata = JSON.parse(eval_span.attributes["braintrust.metadata"] || "{}") + assert_equal "classifier boom", metadata.dig("classifier_errors", "broken") + end + + def test_runner_classifier_span_attributes + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("my_classifier") { |**| {name: "my_classifier", id: "foo"} } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + Braintrust::Eval::Runner.new(context).run + spans = rig.drain + + classifier_span = spans.find { |s| s.name == "my_classifier" } + refute_nil classifier_span + span_attrs = JSON.parse(classifier_span.attributes["braintrust.span_attributes"]) + assert_equal "classifier", span_attrs["type"] + assert_equal "scorer", span_attrs["purpose"] + assert_equal "my_classifier", span_attrs["name"] + end + + def test_runner_classifier_multi_label_result + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("sentiment") do |**| + [ + {name: "sentiment", id: "positive", label: "Positive"}, + {name: "sentiment", id: "enthusiastic", label: "Enthusiastic"} + ] + end + ], + cases: [{input: "great!"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + sentiment = result.classifications["sentiment"] + assert_equal 2, sentiment.length + assert_equal({id: "positive", label: "Positive"}, sentiment[0]) + assert_equal({id: "enthusiastic", label: "Enthusiastic"}, sentiment[1]) + end + + def test_runner_classifier_name_defaults_to_function_name + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("my_classifier") { |**| {id: "foo"} } # no :name in result + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? 
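+    # The block returned {id: "foo"} with no :name, so the result is keyed
+    # by the classifier's own name.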
+ assert result.classifications.key?("my_classifier") + end + + def test_runner_classifications_logged_to_eval_span + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("category") { |**| {name: "category", id: "greeting"} } + ], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + Braintrust::Eval::Runner.new(context).run + spans = rig.drain + + eval_span = spans.find { |s| s.name == "eval" } + refute_nil eval_span + raw = eval_span.attributes["braintrust.classifications"] + refute_nil raw + classifications = JSON.parse(raw) + assert_equal [{"id" => "greeting"}], classifications["category"] + end + + def test_runner_classifications_nil_when_no_classifiers + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [Braintrust::Scorer.new("exact") { 1.0 }], + cases: [{input: "hello"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + assert_nil result.classifications + end + + def test_runner_multiple_cases_accumulate_classifications + rig = setup_otel_test_rig + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + classifiers: [ + Braintrust::Classifier.new("category") { |input:| {name: "category", id: (input.length > 3) ? "long" : "short"} } + ], + cases: [{input: "hi"}, {input: "hello"}, {input: "ok"}], + experiment_id: "exp-123", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + result = Braintrust::Eval::Runner.new(context).run + assert result.success? + category = result.classifications["category"] + assert_equal 3, category.length + end +end diff --git a/test/braintrust/eval_test.rb b/test/braintrust/eval_test.rb index bef2a2cf..6a11d008 100644 --- a/test/braintrust/eval_test.rb +++ b/test/braintrust/eval_test.rb @@ -1384,4 +1384,52 @@ def test_eval_run_with_parameters end end end + + # ============================================ + # Classifier validation tests + # ============================================ + + def test_eval_run_requires_at_least_scorers_or_classifiers + error = assert_raises(ArgumentError) do + Braintrust::Eval.run( + cases: [{input: "hello"}], + task: ->(input:) { input } + ) + end + assert_match(/at least one of scorers or classifiers is required/i, error.message) + end + + def test_eval_run_requires_at_least_scorers_or_classifiers_when_empty_arrays + error = assert_raises(ArgumentError) do + Braintrust::Eval.run( + cases: [{input: "hello"}], + task: ->(input:) { input }, + scorers: [], + classifiers: [] + ) + end + assert_match(/at least one of scorers or classifiers is required/i, error.message) + end + + def test_eval_run_with_classifiers_only_no_scorers + rig = setup_otel_test_rig + + result = run_test_eval( + experiment_id: "exp-123", + experiment_name: "classifier-only", + project_id: "proj-456", + project_name: "test-project", + cases: [{input: "hello"}], + task: ->(input:) { input }, + classifiers: [ + Braintrust::Classifier.new("category") { |**| {name: "category", id: "greeting"} } + ], + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + assert result.success? 
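+    # No scorers were configured, so scores is empty while classifications populate.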
+ assert_equal({}, result.scores) + assert_equal({"category" => [{id: "greeting"}]}, result.classifications) + end end diff --git a/test/braintrust/server/handlers/eval_test.rb b/test/braintrust/server/handlers/eval_test.rb index dce8a868..884d5199 100644 --- a/test/braintrust/server/handlers/eval_test.rb +++ b/test/braintrust/server/handlers/eval_test.rb @@ -59,7 +59,7 @@ def test_returns_400_for_multiple_data_sources # --- SSE streaming --- def test_returns_200_with_sse_content_type - @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }) + @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer]) status, headers, _ = handler.call(rack_json_env( {name: "test-eval", data: {data: [{input: "hello"}]}, experiment_name: "exp"}, @@ -73,7 +73,7 @@ def test_returns_200_with_sse_content_type end def test_streams_progress_event_per_case - @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }) + @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer]) _, _, body = handler.call(rack_json_env( {name: "upcase-eval", data: {data: [{input: "a"}, {input: "b"}, {input: "c"}]}, experiment_name: "exp"}, @@ -88,7 +88,7 @@ def test_streams_progress_event_per_case end def test_progress_event_contains_protocol_fields - @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }) + @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer]) _, _, body = handler.call(rack_json_env( {name: "upcase-eval", data: {data: [{input: "hello"}]}, experiment_name: "exp"}, @@ -108,7 +108,7 @@ def test_progress_event_contains_protocol_fields end def test_progress_event_contains_task_output_as_json_string - @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }) + @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer]) _, _, body = handler.call(rack_json_env( {name: "upcase-eval", data: {data: [{input: "hello"}]}, experiment_name: "exp"}, @@ -165,7 +165,7 @@ def test_summary_event_contains_scores_and_experiment_name end def test_stream_ends_with_done - @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }) + @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer]) _, _, body = handler.call(rack_json_env( {name: "test-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"}, @@ -177,7 +177,7 @@ def test_stream_ends_with_done end def test_task_error_still_emits_progress_and_done - @evaluators["failing-eval"] = test_evaluator(task: -> { raise "boom" }) + @evaluators["failing-eval"] = test_evaluator(task: -> { raise "boom" }, scorers: [noop_scorer]) _, _, body = handler.call(rack_json_env( {name: "failing-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"}, @@ -191,7 +191,7 @@ def test_task_error_still_emits_progress_and_done end def test_task_error_progress_contains_error_event - @evaluators["failing-eval"] = test_evaluator(task: -> { raise "task exploded" }) + @evaluators["failing-eval"] = test_evaluator(task: -> { raise "task exploded" }, scorers: [noop_scorer]) _, _, body = handler.call(rack_json_env( {name: "failing-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"}, @@ -239,7 +239,7 @@ def test_accepts_dataset_id_as_sole_data_source # --- Auth passthrough --- def test_build_state_returns_nil_without_auth - @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }) 
+ @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer]) env = rack_json_env( {name: "test-eval", data: {data: [{input: "hello"}]}}, @@ -252,7 +252,7 @@ def test_build_state_returns_nil_without_auth end def test_build_state_returns_nil_for_non_hash_auth - @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }) + @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer]) env = rack_json_env( {name: "test-eval", data: {data: [{input: "hello"}]}}, @@ -442,7 +442,7 @@ def test_handler_resolves_scores_to_scorer_ids # --- Server-specific body selection --- def test_returns_sse_body_without_protocol_http_request - @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }) + @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer]) _, _, body = handler.call(rack_json_env( {name: "test-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"}, @@ -453,7 +453,7 @@ def test_returns_sse_body_without_protocol_http_request end def test_returns_sse_stream_body_with_protocol_http_request - @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }) + @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer]) env = rack_json_env( {name: "test-eval", data: {data: [{input: "x"}]}, experiment_name: "exp"}, @@ -470,7 +470,7 @@ def test_returns_sse_stream_body_with_protocol_http_request # --- Parent passthrough --- def test_handler_passes_parent_through - @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }) + @evaluators["test-eval"] = test_evaluator(task: ->(input:) { input }, scorers: [noop_scorer]) _, _, body = handler.call(rack_json_env( { @@ -494,6 +494,10 @@ def test_evaluator(**kwargs) Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs) end + def noop_scorer + Braintrust::Scorer.new("noop") { 1.0 } + end + def handler Braintrust::Server::Handlers::Eval.new(@evaluators) end diff --git a/test/braintrust/server/rack/eval_endpoint_test.rb b/test/braintrust/server/rack/eval_endpoint_test.rb index a443e066..8e943ffe 100644 --- a/test/braintrust/server/rack/eval_endpoint_test.rb +++ b/test/braintrust/server/rack/eval_endpoint_test.rb @@ -21,7 +21,7 @@ def app end def test_streams_sse_events_for_inline_data - @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }) + @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer]) post_json "/eval", { name: "upcase-eval", @@ -52,7 +52,7 @@ def test_streams_sse_events_for_inline_data end def test_progress_events_contain_output - @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }) + @evaluators["upcase-eval"] = test_evaluator(task: ->(input:) { input.to_s.upcase }, scorers: [noop_scorer]) post_json "/eval", { name: "upcase-eval", @@ -94,7 +94,8 @@ def test_summary_event_contains_scores def test_error_still_emits_progress_and_done @evaluators["failing-eval"] = test_evaluator( - task: -> { raise "task exploded" } + task: -> { raise "task exploded" }, + scorers: [noop_scorer] ) post_json "/eval", { @@ -167,7 +168,8 @@ def test_parameters_forwarded_to_task task: ->(input:, parameters:) { prefix = parameters["greeting"] || "hey" "#{prefix} #{input}" - } + }, + scorers: [noop_scorer] ) post_json "/eval", { @@ -200,6 +202,10 @@ def test_evaluator(**kwargs) Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: 
@rig.tracer_provider, **kwargs) end + def noop_scorer + Braintrust::Scorer.new("noop") { 1.0 } + end + def post_json(path, body) post path, JSON.generate(body), {"CONTENT_TYPE" => "application/json"} end diff --git a/test/braintrust/server/services/eval_service_test.rb b/test/braintrust/server/services/eval_service_test.rb index 5320d5f7..4f5a32d1 100644 --- a/test/braintrust/server/services/eval_service_test.rb +++ b/test/braintrust/server/services/eval_service_test.rb @@ -92,7 +92,7 @@ def test_validate_accepts_dataset_name # --- stream --- def test_stream_emits_progress_and_done_events - @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }) + @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }, scorers: [noop_scorer]) s = service validated = s.validate({ "name" => "upcase-eval", @@ -129,7 +129,7 @@ def test_stream_emits_summary_with_scores end def test_stream_emits_error_progress_on_task_failure - @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "boom" }) + @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "boom" }, scorers: [noop_scorer]) s = service validated = s.validate({ "name" => "failing-eval", @@ -330,6 +330,10 @@ def test_evaluator(**kwargs) Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs) end + def noop_scorer + Braintrust::Scorer.new("noop") { 1.0 } + end + def collect_streamed_events(svc, validated, auth: nil) chunks = [] sse = Braintrust::Server::SSEWriter.new { |chunk| chunks << chunk } diff --git a/test/support/braintrust_helper.rb b/test/support/braintrust_helper.rb index 49d57eb1..8f355ee1 100644 --- a/test/support/braintrust_helper.rb +++ b/test/support/braintrust_helper.rb @@ -75,11 +75,13 @@ def get_integration_test_api(**options) # Helper to run eval internally without API calls for testing # @param state [State] Braintrust state - def run_test_eval(experiment_id:, experiment_name:, project_id:, project_name:, - cases:, task:, scorers:, state:, parallelism: 1, tracer_provider: nil) + def run_test_eval(experiment_id:, experiment_name:, project_id:, + project_name:, cases:, task:, state:, scorers: [], classifiers: [], + parallelism: 1, tracer_provider: nil) context = Braintrust::Eval::Context.build( task: task, scorers: scorers, + classifiers: classifiers, cases: cases, experiment_id: experiment_id, experiment_name: experiment_name,