From 49d0aa1daaea3288dbb9501477e90612b3a1e418 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 15 Apr 2026 00:52:56 -0700 Subject: [PATCH 1/2] Add Qwen 3.5 MoE to cuda-perf CI and add prefill throughput tracking - Add PyTorchObserver stats output to qwen3_5_moe runner (enables cuda_benchmark.py parsing), --prompt_file flag, and GPU memory stats - Add prefill_throughput metric to cuda_benchmark.py (prefill tok/s alongside existing decode tok/s) - Add Qwen3.5-35B-A3B-HQQ-INT4 to cuda-perf.yml with >1000 token prompt and 512 output tokens, on linux.aws.a100 - Align cuda-perf.yml triggers with cuda.yml (push main/release, ciflow/cuda tags, PR on backends/cuda and backends/aoti paths) - Remove random model selection and schedule trigger; always run all models when triggered --- .ci/scripts/cuda_benchmark.py | 32 +++++++++- .github/workflows/cuda-perf.yml | 93 ++++++++++------------------ examples/models/qwen3_5_moe/main.cpp | 66 ++++++++++++++++---- 3 files changed, 119 insertions(+), 72 deletions(-) diff --git a/.ci/scripts/cuda_benchmark.py b/.ci/scripts/cuda_benchmark.py index b135925d4b4..eb511674645 100644 --- a/.ci/scripts/cuda_benchmark.py +++ b/.ci/scripts/cuda_benchmark.py @@ -18,7 +18,9 @@ class RunMetrics: """Metrics from a single run.""" generated_tokens: int + prompt_tokens: int tokens_per_sec: float + prefill_tokens_per_sec: float model_load_time_ms: float total_inference_time_ms: float encoder_time_ms: float @@ -28,7 +30,8 @@ class RunMetrics: def __repr__(self): return ( f"Tokens: {self.generated_tokens}, " - f"Throughput: {self.tokens_per_sec:.2f} t/s, " + f"Prefill: {self.prefill_tokens_per_sec:.2f} t/s ({self.prompt_tokens} tokens), " + f"Decode: {self.tokens_per_sec:.2f} t/s, " f"Model load: {self.model_load_time_ms:.0f}ms, " f"Total inference: {self.total_inference_time_ms:.0f}ms, " f"Encoder: {self.encoder_time_ms:.0f}ms, " @@ -49,6 +52,7 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]: # Extract values generated_tokens 
= data.get("generated_tokens", 0) + prompt_tokens = data.get("prompt_tokens", 0) inference_start_ms = data.get("inference_start_ms", 0) inference_end_ms = data.get("inference_end_ms", 0) prompt_eval_end_ms = data.get("prompt_eval_end_ms", 0) @@ -72,12 +76,20 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]: if generation_time_ms > 0 else 0 ) + + # Calculate prefill throughput + prefill_tokens_per_sec = ( + (prompt_tokens / encoder_time_ms * 1000) if encoder_time_ms > 0 else 0 + ) + model_load_time_ms = model_load_end_ms - model_load_start_ms first_token_latency_ms = first_token_ms - prompt_eval_end_ms return RunMetrics( generated_tokens=generated_tokens, + prompt_tokens=prompt_tokens, tokens_per_sec=tokens_per_sec, + prefill_tokens_per_sec=prefill_tokens_per_sec, model_load_time_ms=model_load_time_ms, total_inference_time_ms=total_inference_time_ms, encoder_time_ms=encoder_time_ms, @@ -505,6 +517,7 @@ class BenchmarkResults: # Metrics throughput: MetricStats + prefill_throughput: MetricStats model_load_time: MetricStats total_inference_time: MetricStats encoder_time: MetricStats @@ -529,6 +542,10 @@ def to_dict(self) -> dict: "throughput_min": self.throughput.min_val, "throughput_max": self.throughput.max_val, "throughput_stdev": self.throughput.stdev, + "prefill_throughput_mean": self.prefill_throughput.mean, + "prefill_throughput_min": self.prefill_throughput.min_val, + "prefill_throughput_max": self.prefill_throughput.max_val, + "prefill_throughput_stdev": self.prefill_throughput.stdev, "model_load_time_mean": self.model_load_time.mean, "model_load_time_min": self.model_load_time.min_val, "model_load_time_max": self.model_load_time.max_val, @@ -601,6 +618,13 @@ def to_v3_format( runner_type, base_extra_info, ), + self.prefill_throughput.create_v3_record( + model_name_with_quant, + backend, + runner_name, + runner_type, + base_extra_info, + ), self.model_load_time.create_v3_record( model_name_with_quant, backend, @@ -696,6 +720,11 @@ def 
create_metric_stats( "t/s", {"trimmed_runs": len(trimmed_throughput)}, ), + prefill_throughput=create_metric_stats( + "prefill_throughput(tokens/sec)", + [r.prefill_tokens_per_sec for r in results], + "t/s", + ), model_load_time=create_metric_stats( "model_load_time(ms)", [r.model_load_time_ms for r in results], @@ -740,6 +769,7 @@ def print_summary(summary: BenchmarkResults) -> None: # Print all metrics using their print_stats method summary.throughput.print_stats() + summary.prefill_throughput.print_stats() summary.model_load_time.print_stats() summary.total_inference_time.print_stats() summary.encoder_time.print_stats() diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml index c6846a61a44..5c01da00e04 100644 --- a/.github/workflows/cuda-perf.yml +++ b/.github/workflows/cuda-perf.yml @@ -1,47 +1,39 @@ name: cuda-perf on: - schedule: - - cron: 0 8 * * * # 12am / 1am PST (8am UTC) - pull_request: - paths: - - .github/workflows/cuda-perf.yml - - .ci/scripts/cuda_benchmark.py - - .ci/scripts/export_model_artifact.sh - - .ci/scripts/test_model_e2e.sh push: branches: - main + - release/* + tags: + - ciflow/cuda/* + pull_request: paths: - .github/workflows/cuda-perf.yml + - .github/workflows/cuda.yml - .ci/scripts/cuda_benchmark.py - .ci/scripts/export_model_artifact.sh - .ci/scripts/test_model_e2e.sh + - backends/cuda/** + - backends/aoti/** workflow_dispatch: inputs: models: description: Models to be benchmarked (comma-separated HuggingFace model IDs) required: false type: string - default: openai/whisper-small quantizations: description: Quantization types (comma-separated) required: false type: string - default: non-quantized num_runs: description: Number of benchmark runs per model required: false type: string default: "50" - run_all_models: - description: Run all available models (overrides models input) - required: false - type: boolean - default: false concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || 
github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true jobs: @@ -60,48 +52,20 @@ jobs: id: set-parameters shell: bash env: - # All available models and quantizations - ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it,nvidia/parakeet-tdt' + ALL_MODELS: 'mistralai/Voxtral-Mini-3B-2507,openai/whisper-small,openai/whisper-medium,openai/whisper-large-v3-turbo,google/gemma-3-4b-it,nvidia/parakeet-tdt,SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4' ALL_QUANTIZATIONS: 'non-quantized,quantized-int4-tile-packed,quantized-int4-weight-only' NUM_RUNS: ${{ inputs.num_runs || '50' }} - RUN_ALL_MODELS: ${{ inputs.run_all_models || 'false' }} - RANDOM_MODEL: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && 'true' || 'false' }} run: | set -eux MODELS="${{ inputs.models }}" QUANTIZATIONS="${{ inputs.quantizations }}" - # If run_all_models is true, use all models - if [ "$RUN_ALL_MODELS" = "true" ]; then - MODELS="$ALL_MODELS" - echo "Running all available models: $MODELS" - # For non-schedule events (PR, manual trigger without inputs), randomly select one model and one quantization - elif [ -z "$MODELS" ] && [ "${{ github.event_name }}" != "schedule" ]; then - # Split all models into array - IFS=',' read -ra ALL_MODEL_ARRAY <<< "$ALL_MODELS" - # Randomly select one model - RANDOM_MODEL_INDEX=$((RANDOM % ${#ALL_MODEL_ARRAY[@]})) - MODELS="${ALL_MODEL_ARRAY[$RANDOM_MODEL_INDEX]}" - echo "Randomly selected model for PR/push: $MODELS" - elif [ -z "$MODELS" ]; then - # Schedule event: use all models + # Use all models/quantizations unless overridden by workflow_dispatch + if [ -z "$MODELS" ]; then MODELS="$ALL_MODELS" fi - - # If 
run_all_models is true, use all quantizations - if [ "$RUN_ALL_MODELS" = "true" ]; then - QUANTIZATIONS="$ALL_QUANTIZATIONS" - echo "Running all available quantizations: $QUANTIZATIONS" - elif [ -z "$QUANTIZATIONS" ] && [ "${{ github.event_name }}" != "schedule" ]; then - # Split all quantizations into array - IFS=',' read -ra ALL_QUANT_ARRAY <<< "$ALL_QUANTIZATIONS" - # Randomly select one quantization - RANDOM_QUANT_INDEX=$((RANDOM % ${#ALL_QUANT_ARRAY[@]})) - QUANTIZATIONS="${ALL_QUANT_ARRAY[$RANDOM_QUANT_INDEX]}" - echo "Randomly selected quantization for PR/push: $QUANTIZATIONS" - elif [ -z "$QUANTIZATIONS" ]; then - # Schedule event: use all quantizations + if [ -z "$QUANTIZATIONS" ]; then QUANTIZATIONS="$ALL_QUANTIZATIONS" fi @@ -109,19 +73,15 @@ jobs: IFS=',' read -ra MODEL_ARRAY <<< "$MODELS" IFS=',' read -ra QUANT_ARRAY <<< "$QUANTIZATIONS" - # If random model is requested (for main branch push), select one random model from the already selected models - if [ "$RANDOM_MODEL" = "true" ] && [ ${#MODEL_ARRAY[@]} -gt 1 ]; then - RANDOM_INDEX=$((RANDOM % ${#MODEL_ARRAY[@]})) - MODELS="${MODEL_ARRAY[$RANDOM_INDEX]}" - MODEL_ARRAY=("$MODELS") - echo "Random model selected for main branch push: $MODELS" - fi - - # Generate benchmark configs + # Generate benchmark configs (skip invalid model/quant combinations) CONFIGS='{"include":[' FIRST=true for MODEL in "${MODEL_ARRAY[@]}"; do for QUANT in "${QUANT_ARRAY[@]}"; do + # Qwen3.5 MoE only supports quantized-int4-tile-packed + if [[ "$MODEL" == *"Qwen3.5-35B-A3B"* ]] && [ "$QUANT" != "quantized-int4-tile-packed" ]; then + continue + fi if [ "$FIRST" = true ]; then FIRST=false else @@ -152,7 +112,7 @@ jobs: with: timeout: 90 secrets-env: EXECUTORCH_HF_TOKEN - runner: linux.g5.4xlarge.nvidia.gpu + runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} gpu-arch-type: cuda gpu-arch-version: "12.6" use-custom-docker-registry: false @@ -162,6 +122,8 @@ jobs: script: | 
set -eux echo "::group::Setup ExecuTorch" + # Disable MKL to avoid duplicate target error when conda has multiple MKL installations + export USE_MKL=OFF ./install_executorch.sh echo "::endgroup::" @@ -198,7 +160,7 @@ jobs: fail-fast: false with: timeout: 90 - runner: linux.g5.4xlarge.nvidia.gpu + runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} gpu-arch-type: cuda gpu-arch-version: "12.6" use-custom-docker-registry: false @@ -299,6 +261,19 @@ jobs: RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --audio_path $AUDIO --tokenizer_path $TOKENIZER" MODEL_NAME="parakeet_${{ matrix.quant }}" ;; + SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4) + RUNNER="cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner" + TOKENIZER="model_artifacts/tokenizer.json" + # Generate a >1000 token prompt for benchmarking + python3 -c " + text = 'The quick brown fox jumps over the lazy dog. ' * 200 + prompt = '<|im_start|>user\nPlease analyze and summarize the following text in detail:\n\n' + text + '\n<|im_end|>\n<|im_start|>assistant\n' + with open('model_artifacts/long_prompt.txt', 'w') as f: + f.write(prompt) + " + RUNNER_CMD="$RUNNER --model_path model_artifacts/model.pte --data_path model_artifacts/aoti_cuda_blob.ptd --tokenizer_path $TOKENIZER --prompt_file model_artifacts/long_prompt.txt --max_new_tokens 512 --temperature 0" + MODEL_NAME="qwen3_5_moe_${{ matrix.quant }}" + ;; *) echo "Error: Unsupported model '${{ matrix.model }}'" exit 1 diff --git a/examples/models/qwen3_5_moe/main.cpp b/examples/models/qwen3_5_moe/main.cpp index 7f4e60596be..91c7a16f834 100644 --- a/examples/models/qwen3_5_moe/main.cpp +++ b/examples/models/qwen3_5_moe/main.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include @@ -26,6 +26,10 @@ DEFINE_string(model_path, "", "Model .pte file path."); DEFINE_string(data_path, "", "Data file (.ptd) for CUDA backend."); 
DEFINE_string(tokenizer_path, "", "HuggingFace tokenizer.json path."); DEFINE_string(prompt, "Hello", "Prompt text."); +DEFINE_string( + prompt_file, + "", + "Path to file containing prompt text (overrides --prompt)."); DEFINE_double(temperature, 0.8, "Sampling temperature (0 = greedy)."); DEFINE_int32(max_new_tokens, 128, "Maximum tokens to generate."); @@ -50,6 +54,16 @@ int main(int argc, char** argv) { return 1; } + llm::Stats stats; + + // GPU memory before load + size_t gpu_free_bytes, gpu_total_bytes; + cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes); + stats.gpu_total_bytes = gpu_total_bytes; + stats.gpu_free_before_load_bytes = gpu_free_bytes; + + stats.model_load_start_ms = llm::time_in_ms(); + // Load tokenizer auto tokenizer = std::make_unique(); auto tok_status = tokenizer->load(FLAGS_tokenizer_path); @@ -109,11 +123,30 @@ int main(int argc, char** argv) { } } + stats.model_load_end_ms = llm::time_in_ms(); + + // GPU memory after load + cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes); + stats.gpu_free_after_load_bytes = gpu_free_bytes; + // Get EOS ids auto eos_ids = llm::get_eos_ids(tokenizer.get(), module.get()); + // Read prompt from file or flag + std::string prompt_text = FLAGS_prompt; + if (!FLAGS_prompt_file.empty()) { + std::ifstream f(FLAGS_prompt_file); + if (!f.is_open()) { + ET_LOG( + Error, "Failed to open prompt file: %s", FLAGS_prompt_file.c_str()); + return 1; + } + prompt_text = std::string( + (std::istreambuf_iterator(f)), std::istreambuf_iterator()); + } + // Encode prompt - auto encode_result = tokenizer->encode(FLAGS_prompt); + auto encode_result = tokenizer->encode(prompt_text); if (!encode_result.ok()) { ET_LOG(Error, "Failed to encode prompt"); return 1; @@ -122,13 +155,15 @@ int main(int argc, char** argv) { int64_t num_prompt_tokens = prompt_tokens.size(); printf("Prompt tokens: %ld\n", num_prompt_tokens); + stats.num_prompt_tokens = num_prompt_tokens; + stats.inference_start_ms = llm::time_in_ms(); + // 
--------------------------------------------------------------- // Prefill or decode-only // --------------------------------------------------------------- auto S = [](int64_t v) -> SizesType { return static_cast(v); }; uint64_t cur_token = 0; - auto prefill_start = std::chrono::steady_clock::now(); // Chunked prefill std::vector pos_data(num_prompt_tokens); @@ -161,10 +196,11 @@ int main(int argc, char** argv) { std::make_shared(std::move(logits_tensor)); cur_token = llm::logits_to_token(*logits_ptr, FLAGS_temperature); - auto prefill_end = std::chrono::steady_clock::now(); + stats.prompt_eval_end_ms = llm::time_in_ms(); + stats.first_token_ms = stats.prompt_eval_end_ms; + double prefill_ms = - std::chrono::duration(prefill_end - prefill_start) - .count(); + (double)(stats.prompt_eval_end_ms - stats.inference_start_ms); printf( "Prefill: %ld tokens in %.1f ms (%.1f tok/s)\n", num_prompt_tokens, @@ -184,7 +220,6 @@ int main(int argc, char** argv) { // --------------------------------------------------------------- // Decode — generate tokens one at a time // --------------------------------------------------------------- - llm::Stats stats; int64_t pos = num_prompt_tokens; uint64_t prev_token; @@ -195,8 +230,6 @@ int main(int argc, char** argv) { auto decode_pos = from_blob( decode_pos_data.data(), {1}, executorch::aten::ScalarType::Long); - auto decode_start = std::chrono::steady_clock::now(); - for (int32_t step = 0; step < FLAGS_max_new_tokens; step++) { decode_token_data[0] = static_cast(cur_token); decode_pos_data[0] = pos; @@ -235,13 +268,14 @@ int main(int argc, char** argv) { } } - auto decode_end = std::chrono::steady_clock::now(); + stats.inference_end_ms = llm::time_in_ms(); printf("\n"); int64_t num_generated = pos - num_prompt_tokens; + stats.num_generated_tokens = num_generated; + double decode_ms = - std::chrono::duration(decode_end - decode_start) - .count(); + (double)(stats.inference_end_ms - stats.prompt_eval_end_ms); printf( "Decode: %ld tokens 
in %.1f ms (%.1f tok/s)\n", num_generated, @@ -249,5 +283,13 @@ int main(int argc, char** argv) { num_generated * 1000.0 / decode_ms); printf("Prompt tokens: %ld\n", num_prompt_tokens); + // GPU memory after generation + cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes); + stats.gpu_free_after_generate_bytes = gpu_free_bytes; + stats.gpu_peak_usage_mb = + (stats.gpu_total_bytes - gpu_free_bytes) / 1024.0 / 1024.0; + + llm::print_report(stats); + return 0; } From 36213f90c5f7d42ba1a5f7a5e187ba7f2c160b96 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 15 Apr 2026 14:49:01 -0700 Subject: [PATCH 2/2] Fix zero first-token latency: record first_token_ms after the first decode step --- examples/models/qwen3_5_moe/main.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/models/qwen3_5_moe/main.cpp b/examples/models/qwen3_5_moe/main.cpp index 91c7a16f834..168fdc8f395 100644 --- a/examples/models/qwen3_5_moe/main.cpp +++ b/examples/models/qwen3_5_moe/main.cpp @@ -197,7 +197,6 @@ int main(int argc, char** argv) { cur_token = llm::logits_to_token(*logits_ptr, FLAGS_temperature); stats.prompt_eval_end_ms = llm::time_in_ms(); - stats.first_token_ms = stats.prompt_eval_end_ms; double prefill_ms = (double)(stats.prompt_eval_end_ms - stats.inference_start_ms); @@ -254,6 +253,10 @@ int main(int argc, char** argv) { cur_token = llm::logits_to_token(*step_logits_ptr, FLAGS_temperature); stats.on_sampling_end(); + if (step == 0) { + stats.first_token_ms = llm::time_in_ms(); + } + pos++; auto decode_str = tokenizer->decode(prev_token, cur_token);