32 changes: 31 additions & 1 deletion .ci/scripts/cuda_benchmark.py
@@ -18,7 +18,9 @@ class RunMetrics:
"""Metrics from a single run."""

generated_tokens: int
prompt_tokens: int
tokens_per_sec: float
prefill_tokens_per_sec: float
model_load_time_ms: float
total_inference_time_ms: float
encoder_time_ms: float
@@ -28,7 +30,8 @@ class RunMetrics:
def __repr__(self):
return (
f"Tokens: {self.generated_tokens}, "
f"Throughput: {self.tokens_per_sec:.2f} t/s, "
f"Prefill: {self.prefill_tokens_per_sec:.2f} t/s ({self.prompt_tokens} tokens), "
f"Decode: {self.tokens_per_sec:.2f} t/s, "
f"Model load: {self.model_load_time_ms:.0f}ms, "
f"Total inference: {self.total_inference_time_ms:.0f}ms, "
f"Encoder: {self.encoder_time_ms:.0f}ms, "
@@ -49,6 +52,7 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:

# Extract values
generated_tokens = data.get("generated_tokens", 0)
prompt_tokens = data.get("prompt_tokens", 0)
inference_start_ms = data.get("inference_start_ms", 0)
inference_end_ms = data.get("inference_end_ms", 0)
prompt_eval_end_ms = data.get("prompt_eval_end_ms", 0)
@@ -72,12 +76,20 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:
if generation_time_ms > 0
else 0
)

# Calculate prefill throughput
prefill_tokens_per_sec = (
(prompt_tokens / encoder_time_ms * 1000) if encoder_time_ms > 0 else 0
)

model_load_time_ms = model_load_end_ms - model_load_start_ms
first_token_latency_ms = first_token_ms - prompt_eval_end_ms

return RunMetrics(
generated_tokens=generated_tokens,
prompt_tokens=prompt_tokens,
tokens_per_sec=tokens_per_sec,
prefill_tokens_per_sec=prefill_tokens_per_sec,
model_load_time_ms=model_load_time_ms,
total_inference_time_ms=total_inference_time_ms,
encoder_time_ms=encoder_time_ms,
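
A quick worked example of the two throughput figures. The derivations of encoder_time_ms and generation_time_ms fall outside this hunk, so the sketch below assumes encoder time spans inference start to prompt-eval end and generation time spans prompt-eval end to inference end; the key names come from the parser above, the numeric values are invented for illustration.

    # Hypothetical observer payload; keys mirror parse_pytorch_observer_log,
    # values invented for illustration.
    data = {
        "prompt_tokens": 1024,
        "generated_tokens": 128,
        "inference_start_ms": 1000,
        "prompt_eval_end_ms": 1250,
        "inference_end_ms": 3250,
    }

    # Assumed time spans (the real derivations are outside this hunk).
    encoder_time_ms = data["prompt_eval_end_ms"] - data["inference_start_ms"]   # 250 ms
    generation_time_ms = data["inference_end_ms"] - data["prompt_eval_end_ms"]  # 2000 ms

    # Prefill throughput as added in this diff: prompt tokens per second of encoder time.
    prefill_tokens_per_sec = (
        (data["prompt_tokens"] / encoder_time_ms * 1000) if encoder_time_ms > 0 else 0
    )  # 4096.0 t/s
    # Decode throughput, assumed to match the existing tokens_per_sec definition.
    tokens_per_sec = (
        (data["generated_tokens"] / generation_time_ms * 1000) if generation_time_ms > 0 else 0
    )  # 64.0 t/s
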
@@ -505,6 +517,7 @@ class BenchmarkResults:

# Metrics
throughput: MetricStats
prefill_throughput: MetricStats
model_load_time: MetricStats
total_inference_time: MetricStats
encoder_time: MetricStats
@@ -529,6 +542,10 @@ def to_dict(self) -> dict:
"throughput_min": self.throughput.min_val,
"throughput_max": self.throughput.max_val,
"throughput_stdev": self.throughput.stdev,
"prefill_throughput_mean": self.prefill_throughput.mean,
"prefill_throughput_min": self.prefill_throughput.min_val,
"prefill_throughput_max": self.prefill_throughput.max_val,
"prefill_throughput_stdev": self.prefill_throughput.stdev,
"model_load_time_mean": self.model_load_time.mean,
"model_load_time_min": self.model_load_time.min_val,
"model_load_time_max": self.model_load_time.max_val,
@@ -601,6 +618,13 @@ def to_v3_format(
runner_type,
base_extra_info,
),
self.prefill_throughput.create_v3_record(
model_name_with_quant,
backend,
runner_name,
runner_type,
base_extra_info,
),
self.model_load_time.create_v3_record(
model_name_with_quant,
backend,
@@ -696,6 +720,11 @@ def create_metric_stats(
"t/s",
{"trimmed_runs": len(trimmed_throughput)},
),
prefill_throughput=create_metric_stats(
"prefill_throughput(tokens/sec)",
[r.prefill_tokens_per_sec for r in results],
"t/s",
),
model_load_time=create_metric_stats(
"model_load_time(ms)",
[r.model_load_time_ms for r in results],
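
MetricStats and create_metric_stats are defined outside this diff; for context, a minimal sketch of the aggregation implied by the call sites above (field names taken from to_dict, implementation assumed):

    import statistics
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class MetricStats:
        name: str
        mean: float
        min_val: float
        max_val: float
        stdev: float
        unit: str

    def create_metric_stats(
        name: str, values: list, unit: str, extra_info: Optional[dict] = None
    ) -> MetricStats:
        # Assumed aggregation; the real helper may trim or round differently.
        # extra_info (e.g. trimmed_runs) is presumably carried through to reporting.
        return MetricStats(
            name=name,
            mean=statistics.mean(values),
            min_val=min(values),
            max_val=max(values),
            stdev=statistics.stdev(values) if len(values) > 1 else 0.0,
            unit=unit,
        )

One asymmetry worth noting: the decode throughput above is aggregated over trimmed_throughput, while the new prefill metric aggregates every run; if prefill should also exclude warm-up runs, it may want the same trimming.
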
@@ -740,6 +769,7 @@ def print_summary(summary: BenchmarkResults) -> None:

# Print all metrics using their print_stats method
summary.throughput.print_stats()
summary.prefill_throughput.print_stats()
summary.model_load_time.print_stats()
summary.total_inference_time.print_stats()
summary.encoder_time.print_stats()
93 changes: 34 additions & 59 deletions .github/workflows/cuda-perf.yml
@@ -1,47 +1,39 @@
name: cuda-perf

on:
schedule:
- cron: 0 8 * * * # 12am / 1am PST (8am UTC)
pull_request:
paths:
- .github/workflows/cuda-perf.yml
- .ci/scripts/cuda_benchmark.py
- .ci/scripts/export_model_artifact.sh
- .ci/scripts/test_model_e2e.sh
push:
branches:
- main
- release/*
tags:
- ciflow/cuda/*
pull_request:
paths:
- .github/workflows/cuda-perf.yml
- .github/workflows/cuda.yml
- .ci/scripts/cuda_benchmark.py
- .ci/scripts/export_model_artifact.sh
- .ci/scripts/test_model_e2e.sh
- backends/cuda/**
- backends/aoti/**
workflow_dispatch:
inputs:
models:
description: Models to be benchmarked (comma-separated HuggingFace model IDs)
required: false
type: string
default: openai/whisper-small
quantizations:
description: Quantization types (comma-separated)
required: false
type: string
default: non-quantized
num_runs:
description: Number of benchmark runs per model
required: false
type: string
default: "50"
run_all_models:
description: Run all available models (overrides models input)
required: false
type: boolean
default: false

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true

jobs:
@@ -60,68 +52,36 @@ jobs:
id: set-parameters
shell: bash
env:
# All available models and quantizations
ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it,nvidia/parakeet-tdt'
ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it,nvidia/parakeet-tdt,SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4'
ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only'
NUM_RUNS: ${{ inputs.num_runs || '50' }}
RUN_ALL_MODELS: ${{ inputs.run_all_models || 'false' }}
RANDOM_MODEL: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && 'true' || 'false' }}
run: |
set -eux

MODELS="${{ inputs.models }}"
QUANTIZATIONS="${{ inputs.quantizations }}"

# If run_all_models is true, use all models
if [ "$RUN_ALL_MODELS" = "true" ]; then
MODELS="$ALL_MODELS"
echo "Running all available models: $MODELS"
# For non-schedule events (PR, manual trigger without inputs), randomly select one model and one quantization
elif [ -z "$MODELS" ] && [ "${{ github.event_name }}" != "schedule" ]; then
# Split all models into array
IFS=',' read -ra ALL_MODEL_ARRAY <<< "$ALL_MODELS"
# Randomly select one model
RANDOM_MODEL_INDEX=$((RANDOM % ${#ALL_MODEL_ARRAY[@]}))
MODELS="${ALL_MODEL_ARRAY[$RANDOM_MODEL_INDEX]}"
echo "Randomly selected model for PR/push: $MODELS"
elif [ -z "$MODELS" ]; then
# Schedule event: use all models
# Use all models/quantizations unless overridden by workflow_dispatch
if [ -z "$MODELS" ]; then
MODELS="$ALL_MODELS"
fi

# If run_all_models is true, use all quantizations
if [ "$RUN_ALL_MODELS" = "true" ]; then
QUANTIZATIONS="$ALL_QUANTIZATIONS"
echo "Running all available quantizations: $QUANTIZATIONS"
elif [ -z "$QUANTIZATIONS" ] && [ "${{ github.event_name }}" != "schedule" ]; then
# Split all quantizations into array
IFS=',' read -ra ALL_QUANT_ARRAY <<< "$ALL_QUANTIZATIONS"
# Randomly select one quantization
RANDOM_QUANT_INDEX=$((RANDOM % ${#ALL_QUANT_ARRAY[@]}))
QUANTIZATIONS="${ALL_QUANT_ARRAY[$RANDOM_QUANT_INDEX]}"
echo "Randomly selected quantization for PR/push: $QUANTIZATIONS"
elif [ -z "$QUANTIZATIONS" ]; then
# Schedule event: use all quantizations
if [ -z "$QUANTIZATIONS" ]; then
QUANTIZATIONS="$ALL_QUANTIZATIONS"
fi

# Split models and quantizations into arrays
IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
IFS=',' read -ra QUANT_ARRAY <<< "$QUANTIZATIONS"

# If random model is requested (for main branch push), select one random model from the already selected models
if [ "$RANDOM_MODEL" = "true" ] && [ ${#MODEL_ARRAY[@]} -gt 1 ]; then
RANDOM_INDEX=$((RANDOM % ${#MODEL_ARRAY[@]}))
MODELS="${MODEL_ARRAY[$RANDOM_INDEX]}"
MODEL_ARRAY=("$MODELS")
echo "Random model selected for main branch push: $MODELS"
fi

# Generate benchmark configs
# Generate benchmark configs (skip invalid model/quant combinations)
CONFIGS='{"include":['
FIRST=true
for MODEL in "${MODEL_ARRAY[@]}"; do
for QUANT in "${QUANT_ARRAY[@]}"; do
# Qwen3.5 MoE only supports quantized-int4-tile-packed
if [[ "$MODEL" == *"Qwen3.5-35B-A3B"* ]] && [ "$QUANT" != "quantized-int4-tile-packed" ]; then
continue
fi
if [ "$FIRST" = true ]; then
FIRST=false
else
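
The matrix expansion above, end to end, in a short Python sketch. The per-entry fields are assumed to be model/quant, matching how matrix.model and matrix.quant are referenced later in the workflow; the lists mirror ALL_MODELS and ALL_QUANTIZATIONS.

    import json
    from itertools import product

    all_models = [
        "mistralai/Voxtral-Mini-3B-2507", "openai/whisper-small",
        "openai/whisper-medium", "openai/whisper-large-v3-turbo",
        "google/gemma-3-4b-it", "nvidia/parakeet-tdt",
        "SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4",
    ]
    all_quants = [
        "non-quantized", "quantized-int4-tile-packed", "quantized-int4-weight-only"
    ]

    include = [
        {"model": m, "quant": q}
        for m, q in product(all_models, all_quants)
        # Qwen3.5 MoE only supports quantized-int4-tile-packed; skip other combos.
        if not ("Qwen3.5-35B-A3B" in m and q != "quantized-int4-tile-packed")
    ]
    print(json.dumps({"include": include}))
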
@@ -152,7 +112,7 @@ jobs:
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
runner: linux.g5.4xlarge.nvidia.gpu
runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
gpu-arch-type: cuda
gpu-arch-version: "12.6"
use-custom-docker-registry: false
@@ -162,6 +122,8 @@ script: |
script: |
set -eux
echo "::group::Setup ExecuTorch"
# Disable MKL to avoid duplicate target error when conda has multiple MKL installations
export USE_MKL=OFF
./install_executorch.sh
echo "::endgroup::"

@@ -198,7 +160,7 @@ jobs:
fail-fast: false
with:
timeout: 90
runner: linux.g5.4xlarge.nvidia.gpu
runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
gpu-arch-type: cuda
gpu-arch-version: "12.6"
use-custom-docker-registry: false
@@ -299,6 +261,19 @@ jobs:
RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --audio_path $AUDIO --tokenizer_path $TOKENIZER"
MODEL_NAME="parakeet_${{ matrix.quant }}"
;;
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
RUNNER="cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
TOKENIZER="model_artifacts/tokenizer.json"
# Generate a >1000 token prompt for benchmarking
python3 -c "
text = 'The quick brown fox jumps over the lazy dog. ' * 200
prompt = '<|im_start|>user\nPlease analyze and summarize the following text in detail:\n\n' + text + '\n<|im_end|>\n<|im_start|>assistant\n'
with open('model_artifacts/long_prompt.txt', 'w') as f:
f.write(prompt)
"
RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --prompt_file model_artifacts/long_prompt.txt --max_new_tokens 512 --temperature 0"
MODEL_NAME="qwen3_5_moe_${{ matrix.quant }}"
;;
*)
echo "Error: Unsupported model '${{ matrix.model }}'"
exit 1
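
To double-check that the generated prompt really clears the >1000-token bar, a quick offline check with the tokenizers library (assuming model_artifacts/tokenizer.json is a standard HF fast-tokenizer file):

    from tokenizers import Tokenizer

    tok = Tokenizer.from_file("model_artifacts/tokenizer.json")
    with open("model_artifacts/long_prompt.txt") as f:
        prompt = f.read()

    # "The quick brown fox..." is roughly 10 tokens per repetition, so 200
    # repetitions should land well past 1000 tokens for most BPE vocabularies.
    num_tokens = len(tok.encode(prompt).ids)
    assert num_tokens > 1000, num_tokens
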