From 49d0aa1daaea3288dbb9501477e90612b3a1e418 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@fb.com>
Date: Wed, 15 Apr 2026 00:52:56 -0700
Subject: [PATCH 1/2] Add Qwen 3.5 MoE to cuda-perf CI and add prefill
 throughput tracking

- Add PyTorchObserver stats output (enables cuda_benchmark.py parsing),
  a --prompt_file flag, and GPU memory stats to the qwen3_5_moe runner
- Add prefill_throughput metric to cuda_benchmark.py (prefill tok/s
  alongside existing decode tok/s)
- Add Qwen3.5-35B-A3B-HQQ-INT4 to cuda-perf.yml with >1000 token
  prompt and 512 output tokens, on linux.aws.a100
- Align cuda-perf.yml triggers with cuda.yml (push main/release,
  ciflow/cuda tags, PR on backends/cuda and backends/aoti paths)
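- Remove random model selection, the run_all_models input, and the
  schedule trigger; run all valid model/quantization combinations by
  default (workflow_dispatch inputs can still narrow the set)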
---
 .ci/scripts/cuda_benchmark.py        | 32 +++++++++-
 .github/workflows/cuda-perf.yml      | 93 ++++++++++------------------
 examples/models/qwen3_5_moe/main.cpp | 66 ++++++++++++++++----
 3 files changed, 119 insertions(+), 72 deletions(-)

diff --git a/.ci/scripts/cuda_benchmark.py b/.ci/scripts/cuda_benchmark.py
index b135925d4b4..eb511674645 100644
--- a/.ci/scripts/cuda_benchmark.py
+++ b/.ci/scripts/cuda_benchmark.py
@@ -18,7 +18,9 @@ class RunMetrics:
     """Metrics from a single run."""
 
     generated_tokens: int
+    prompt_tokens: int
     tokens_per_sec: float
+    prefill_tokens_per_sec: float
     model_load_time_ms: float
     total_inference_time_ms: float
     encoder_time_ms: float
@@ -28,7 +30,8 @@ class RunMetrics:
     def __repr__(self):
         return (
             f"Tokens: {self.generated_tokens}, "
-            f"Throughput: {self.tokens_per_sec:.2f} t/s, "
+            f"Prefill: {self.prefill_tokens_per_sec:.2f} t/s ({self.prompt_tokens} tokens), "
+            f"Decode: {self.tokens_per_sec:.2f} t/s, "
             f"Model load: {self.model_load_time_ms:.0f}ms, "
             f"Total inference: {self.total_inference_time_ms:.0f}ms, "
             f"Encoder: {self.encoder_time_ms:.0f}ms, "
@@ -49,6 +52,7 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:
 
         # Extract values
         generated_tokens = data.get("generated_tokens", 0)
+        prompt_tokens = data.get("prompt_tokens", 0)
         inference_start_ms = data.get("inference_start_ms", 0)
         inference_end_ms = data.get("inference_end_ms", 0)
         prompt_eval_end_ms = data.get("prompt_eval_end_ms", 0)
@@ -72,12 +76,20 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:
             if generation_time_ms > 0
             else 0
         )
+
+        # Calculate prefill throughput
+        prefill_tokens_per_sec = (
+            (prompt_tokens / encoder_time_ms * 1000) if encoder_time_ms > 0 else 0
+        )
+
         model_load_time_ms = model_load_end_ms - model_load_start_ms
         first_token_latency_ms = first_token_ms - prompt_eval_end_ms
 
         return RunMetrics(
             generated_tokens=generated_tokens,
+            prompt_tokens=prompt_tokens,
             tokens_per_sec=tokens_per_sec,
+            prefill_tokens_per_sec=prefill_tokens_per_sec,
             model_load_time_ms=model_load_time_ms,
             total_inference_time_ms=total_inference_time_ms,
             encoder_time_ms=encoder_time_ms,
@@ -505,6 +517,7 @@ class BenchmarkResults:
 
     # Metrics
     throughput: MetricStats
+    prefill_throughput: MetricStats
     model_load_time: MetricStats
     total_inference_time: MetricStats
     encoder_time: MetricStats
@@ -529,6 +542,10 @@ def to_dict(self) -> dict:
             "throughput_min": self.throughput.min_val,
             "throughput_max": self.throughput.max_val,
             "throughput_stdev": self.throughput.stdev,
+            "prefill_throughput_mean": self.prefill_throughput.mean,
+            "prefill_throughput_min": self.prefill_throughput.min_val,
+            "prefill_throughput_max": self.prefill_throughput.max_val,
+            "prefill_throughput_stdev": self.prefill_throughput.stdev,
             "model_load_time_mean": self.model_load_time.mean,
             "model_load_time_min": self.model_load_time.min_val,
             "model_load_time_max": self.model_load_time.max_val,
@@ -601,6 +618,13 @@ def to_v3_format(
                 runner_type,
                 base_extra_info,
             ),
+            self.prefill_throughput.create_v3_record(
+                model_name_with_quant,
+                backend,
+                runner_name,
+                runner_type,
+                base_extra_info,
+            ),
             self.model_load_time.create_v3_record(
                 model_name_with_quant,
                 backend,
@@ -696,6 +720,11 @@ def create_metric_stats(
             "t/s",
             {"trimmed_runs": len(trimmed_throughput)},
         ),
+        prefill_throughput=create_metric_stats(
+            "prefill_throughput(tokens/sec)",
+            [r.prefill_tokens_per_sec for r in results],
+            "t/s",
+        ),
         model_load_time=create_metric_stats(
             "model_load_time(ms)",
             [r.model_load_time_ms for r in results],
@@ -740,6 +769,7 @@ def print_summary(summary: BenchmarkResults) -> None:
 
     # Print all metrics using their print_stats method
     summary.throughput.print_stats()
+    summary.prefill_throughput.print_stats()
     summary.model_load_time.print_stats()
     summary.total_inference_time.print_stats()
     summary.encoder_time.print_stats()
diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml
index c6846a61a44..5c01da00e04 100644
--- a/.github/workflows/cuda-perf.yml
+++ b/.github/workflows/cuda-perf.yml
@@ -1,47 +1,39 @@
 name: cuda-perf
 
 on:
-  schedule:
-    - cron: 0 8 * * *  # 12am / 1am PST (8am UTC)
-  pull_request:
-    paths:
-      - .github/workflows/cuda-perf.yml
-      - .ci/scripts/cuda_benchmark.py
-      - .ci/scripts/export_model_artifact.sh
-      - .ci/scripts/test_model_e2e.sh
   push:
     branches:
       - main
+      - release/*
+    tags:
+      - ciflow/cuda/*
+  pull_request:
     paths:
       - .github/workflows/cuda-perf.yml
+      - .github/workflows/cuda.yml
       - .ci/scripts/cuda_benchmark.py
       - .ci/scripts/export_model_artifact.sh
       - .ci/scripts/test_model_e2e.sh
+      - backends/cuda/**
+      - backends/aoti/**
   workflow_dispatch:
     inputs:
       models:
         description: Models to be benchmarked (comma-separated HuggingFace model IDs)
         required: false
         type: string
-        default: openai/whisper-small
       quantizations:
         description: Quantization types (comma-separated)
         required: false
         type: string
-        default: non-quantized
       num_runs:
         description: Number of benchmark runs per model
         required: false
         type: string
         default: "50"
-      run_all_models:
-        description: Run all available models (overrides models input)
-        required: false
-        type: boolean
-        default: false
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
 jobs:
@@ -60,48 +52,20 @@ jobs:
         id: set-parameters
         shell: bash
         env:
-          # All available models and quantizations
-          ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it,nvidia/parakeet-tdt'
+          ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it,nvidia/parakeet-tdt,SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4'
           ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only'
           NUM_RUNS: ${{ inputs.num_runs || '50' }}
-          RUN_ALL_MODELS: ${{ inputs.run_all_models || 'false' }}
-          RANDOM_MODEL: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && 'true' || 'false' }}
         run: |
           set -eux
 
           MODELS="${{ inputs.models }}"
           QUANTIZATIONS="${{ inputs.quantizations }}"
 
-          # If run_all_models is true, use all models
-          if [ "$RUN_ALL_MODELS" = "true" ]; then
-            MODELS="$ALL_MODELS"
-            echo "Running all available models: $MODELS"
-          # For non-schedule events (PR, manual trigger without inputs), randomly select one model and one quantization
-          elif [ -z "$MODELS" ] && [ "${{ github.event_name }}" != "schedule" ]; then
-            # Split all models into array
-            IFS=',' read -ra ALL_MODEL_ARRAY <<< "$ALL_MODELS"
-            # Randomly select one model
-            RANDOM_MODEL_INDEX=$((RANDOM % ${#ALL_MODEL_ARRAY[@]}))
-            MODELS="${ALL_MODEL_ARRAY[$RANDOM_MODEL_INDEX]}"
-            echo "Randomly selected model for PR/push: $MODELS"
-          elif [ -z "$MODELS" ]; then
-            # Schedule event: use all models
+          # Use all models/quantizations unless overridden by workflow_dispatch
+          if [ -z "$MODELS" ]; then
             MODELS="$ALL_MODELS"
           fi
-
-          # If run_all_models is true, use all quantizations
-          if [ "$RUN_ALL_MODELS" = "true" ]; then
-            QUANTIZATIONS="$ALL_QUANTIZATIONS"
-            echo "Running all available quantizations: $QUANTIZATIONS"
-          elif [ -z "$QUANTIZATIONS" ] && [ "${{ github.event_name }}" != "schedule" ]; then
-            # Split all quantizations into array
-            IFS=',' read -ra ALL_QUANT_ARRAY <<< "$ALL_QUANTIZATIONS"
-            # Randomly select one quantization
-            RANDOM_QUANT_INDEX=$((RANDOM % ${#ALL_QUANT_ARRAY[@]}))
-            QUANTIZATIONS="${ALL_QUANT_ARRAY[$RANDOM_QUANT_INDEX]}"
-            echo "Randomly selected quantization for PR/push: $QUANTIZATIONS"
-          elif [ -z "$QUANTIZATIONS" ]; then
-            # Schedule event: use all quantizations
+          if [ -z "$QUANTIZATIONS" ]; then
             QUANTIZATIONS="$ALL_QUANTIZATIONS"
           fi
 
@@ -109,19 +73,15 @@ jobs:
           IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
           IFS=',' read -ra QUANT_ARRAY <<< "$QUANTIZATIONS"
 
-          # If random model is requested (for main branch push), select one random model from the already selected models
-          if [ "$RANDOM_MODEL" = "true" ] && [ ${#MODEL_ARRAY[@]} -gt 1 ]; then
-            RANDOM_INDEX=$((RANDOM % ${#MODEL_ARRAY[@]}))
-            MODELS="${MODEL_ARRAY[$RANDOM_INDEX]}"
-            MODEL_ARRAY=("$MODELS")
-            echo "Random model selected for main branch push: $MODELS"
-          fi
-
-          # Generate benchmark configs
+          # Generate benchmark configs (skip invalid model/quant combinations)
           CONFIGS='{"include":['
           FIRST=true
           for MODEL in "${MODEL_ARRAY[@]}"; do
             for QUANT in "${QUANT_ARRAY[@]}"; do
+              # Qwen3.5 MoE only supports quantized-int4-tile-packed
+              if [[ "$MODEL" == *"Qwen3.5-35B-A3B"* ]] && [ "$QUANT" != "quantized-int4-tile-packed" ]; then
+                continue
+              fi
               if [ "$FIRST" = true ]; then
                 FIRST=false
               else
@@ -152,7 +112,7 @@ jobs:
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
-      runner: linux.g5.4xlarge.nvidia.gpu
+      runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
       gpu-arch-type: cuda
       gpu-arch-version: "12.6"
       use-custom-docker-registry: false
@@ -162,6 +122,8 @@ jobs:
       script: |
         set -eux
         echo "::group::Setup ExecuTorch"
+        # Disable MKL to avoid duplicate target error when conda has multiple MKL installations
+        export USE_MKL=OFF
         ./install_executorch.sh
         echo "::endgroup::"
 
@@ -198,7 +160,7 @@ jobs:
       fail-fast: false
     with:
       timeout: 90
-      runner: linux.g5.4xlarge.nvidia.gpu
+      runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
       gpu-arch-type: cuda
       gpu-arch-version: "12.6"
       use-custom-docker-registry: false
@@ -299,6 +261,19 @@ jobs:
             RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --audio_path $AUDIO --tokenizer_path $TOKENIZER"
             MODEL_NAME="parakeet_${{ matrix.quant }}"
             ;;
+          SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
+            RUNNER="cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
+            TOKENIZER="model_artifacts/tokenizer.json"
+            # Generate a >1000 token prompt for benchmarking
+            python3 -c "
+        text = 'The quick brown fox jumps over the lazy dog. ' * 200
+        prompt = '<|im_start|>user\nPlease analyze and summarize the following text in detail:\n\n' + text + '\n<|im_end|>\n<|im_start|>assistant\n'
+        with open('model_artifacts/long_prompt.txt', 'w') as f:
+            f.write(prompt)
+        "
+            RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --prompt_file model_artifacts/long_prompt.txt --max_new_tokens 512 --temperature 0"
+            MODEL_NAME="qwen3_5_moe_${{ matrix.quant }}"
+            ;;
           *)
             echo "Error: Unsupported model '${{ matrix.model }}'"
             exit 1
diff --git a/examples/models/qwen3_5_moe/main.cpp b/examples/models/qwen3_5_moe/main.cpp
index 7f4e60596be..91c7a16f834 100644
--- a/examples/models/qwen3_5_moe/main.cpp
+++ b/examples/models/qwen3_5_moe/main.cpp
@@ -16,7 +16,7 @@
 #include <executorch/runtime/platform/log.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 
-#include <chrono>
+#include <fstream>
 #include <string>
 #include <vector>
 
@@ -26,6 +26,10 @@ DEFINE_string(model_path, "", "Model .pte file path.");
 DEFINE_string(data_path, "", "Data file (.ptd) for CUDA backend.");
 DEFINE_string(tokenizer_path, "", "HuggingFace tokenizer.json path.");
 DEFINE_string(prompt, "Hello", "Prompt text.");
+DEFINE_string(
+    prompt_file,
+    "",
+    "Path to file containing prompt text (overrides --prompt).");
 DEFINE_double(temperature, 0.8, "Sampling temperature (0 = greedy).");
 DEFINE_int32(max_new_tokens, 128, "Maximum tokens to generate.");
 
@@ -50,6 +54,16 @@ int main(int argc, char** argv) {
     return 1;
   }
 
+  llm::Stats stats;
+
+  // GPU memory before load
+  size_t gpu_free_bytes, gpu_total_bytes;
+  cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
+  stats.gpu_total_bytes = gpu_total_bytes;
+  stats.gpu_free_before_load_bytes = gpu_free_bytes;
+
+  stats.model_load_start_ms = llm::time_in_ms();
+
   // Load tokenizer
   auto tokenizer = std::make_unique<tokenizers::HFTokenizer>();
   auto tok_status = tokenizer->load(FLAGS_tokenizer_path);
@@ -109,11 +123,30 @@ int main(int argc, char** argv) {
     }
   }
 
+  stats.model_load_end_ms = llm::time_in_ms();
+
+  // GPU memory after load
+  cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
+  stats.gpu_free_after_load_bytes = gpu_free_bytes;
+
   // Get EOS ids
   auto eos_ids = llm::get_eos_ids(tokenizer.get(), module.get());
 
+  // Read prompt from file or flag
+  std::string prompt_text = FLAGS_prompt;
+  if (!FLAGS_prompt_file.empty()) {
+    std::ifstream f(FLAGS_prompt_file);
+    if (!f.is_open()) {
+      ET_LOG(
+          Error, "Failed to open prompt file: %s", FLAGS_prompt_file.c_str());
+      return 1;
+    }
+    prompt_text = std::string(
+        (std::istreambuf_iterator<char>(f)), std::istreambuf_iterator<char>());
+  }
+
   // Encode prompt
-  auto encode_result = tokenizer->encode(FLAGS_prompt);
+  auto encode_result = tokenizer->encode(prompt_text);
   if (!encode_result.ok()) {
     ET_LOG(Error, "Failed to encode prompt");
     return 1;
@@ -122,13 +155,15 @@ int main(int argc, char** argv) {
   int64_t num_prompt_tokens = prompt_tokens.size();
   printf("Prompt tokens: %ld\n", num_prompt_tokens);
 
+  stats.num_prompt_tokens = num_prompt_tokens;
+  stats.inference_start_ms = llm::time_in_ms();
+
   // ---------------------------------------------------------------
   // Prefill or decode-only
   // ---------------------------------------------------------------
   auto S = [](int64_t v) -> SizesType { return static_cast<SizesType>(v); };
 
   uint64_t cur_token = 0;
-  auto prefill_start = std::chrono::steady_clock::now();
 
   // Chunked prefill
   std::vector<int64_t> pos_data(num_prompt_tokens);
@@ -161,10 +196,11 @@ int main(int argc, char** argv) {
       std::make_shared<executorch::aten::Tensor>(std::move(logits_tensor));
   cur_token = llm::logits_to_token(*logits_ptr, FLAGS_temperature);
 
-  auto prefill_end = std::chrono::steady_clock::now();
+  stats.prompt_eval_end_ms = llm::time_in_ms();
+  stats.first_token_ms = stats.prompt_eval_end_ms;
+
   double prefill_ms =
-      std::chrono::duration<double, std::milli>(prefill_end - prefill_start)
-          .count();
+      (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
   printf(
       "Prefill: %ld tokens in %.1f ms (%.1f tok/s)\n",
       num_prompt_tokens,
@@ -184,7 +220,6 @@ int main(int argc, char** argv) {
   // ---------------------------------------------------------------
   // Decode — generate tokens one at a time
   // ---------------------------------------------------------------
-  llm::Stats stats;
   int64_t pos = num_prompt_tokens;
   uint64_t prev_token;
 
@@ -195,8 +230,6 @@ int main(int argc, char** argv) {
   auto decode_pos = from_blob(
       decode_pos_data.data(), {1}, executorch::aten::ScalarType::Long);
 
-  auto decode_start = std::chrono::steady_clock::now();
-
   for (int32_t step = 0; step < FLAGS_max_new_tokens; step++) {
     decode_token_data[0] = static_cast<int64_t>(cur_token);
     decode_pos_data[0] = pos;
@@ -235,13 +268,14 @@ int main(int argc, char** argv) {
     }
   }
 
-  auto decode_end = std::chrono::steady_clock::now();
+  stats.inference_end_ms = llm::time_in_ms();
 
   printf("\n");
   int64_t num_generated = pos - num_prompt_tokens;
+  stats.num_generated_tokens = num_generated;
+
   double decode_ms =
-      std::chrono::duration<double, std::milli>(decode_end - decode_start)
-          .count();
+      (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
   printf(
       "Decode: %ld tokens in %.1f ms (%.1f tok/s)\n",
       num_generated,
@@ -249,5 +283,13 @@ int main(int argc, char** argv) {
       num_generated * 1000.0 / decode_ms);
   printf("Prompt tokens: %ld\n", num_prompt_tokens);
 
+  // GPU memory after generation
+  cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
+  stats.gpu_free_after_generate_bytes = gpu_free_bytes;
+  stats.gpu_peak_usage_mb =
+      (stats.gpu_total_bytes - gpu_free_bytes) / 1024.0 / 1024.0;
+
+  llm::print_report(stats);
+
   return 0;
 }

From 36213f90c5f7d42ba1a5f7a5e187ba7f2c160b96 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Wed, 15 Apr 2026 14:49:01 -0700
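Subject: [PATCH 2/2] Fix zero first-token latency in qwen3_5_moe runner

The previous patch set stats.first_token_ms equal to
stats.prompt_eval_end_ms, so the first-token latency that
cuda_benchmark.py derives (first_token_ms - prompt_eval_end_ms) was
always 0. Record first_token_ms after sampling in the first decode step
instead.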

---
 examples/models/qwen3_5_moe/main.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/models/qwen3_5_moe/main.cpp b/examples/models/qwen3_5_moe/main.cpp
index 91c7a16f834..168fdc8f395 100644
--- a/examples/models/qwen3_5_moe/main.cpp
+++ b/examples/models/qwen3_5_moe/main.cpp
@@ -197,7 +197,6 @@ int main(int argc, char** argv) {
   cur_token = llm::logits_to_token(*logits_ptr, FLAGS_temperature);
 
   stats.prompt_eval_end_ms = llm::time_in_ms();
-  stats.first_token_ms = stats.prompt_eval_end_ms;
 
   double prefill_ms =
       (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
@@ -254,6 +253,10 @@ int main(int argc, char** argv) {
     cur_token = llm::logits_to_token(*step_logits_ptr, FLAGS_temperature);
     stats.on_sampling_end();
 
+    if (step == 0) {
+      stats.first_token_ms = llm::time_in_ms();
+    }
+
     pos++;
 
     auto decode_str = tokenizer->decode(prev_token, cur_token);