From aba72236eb6bd335572c2fca102b625e76278af7 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Thu, 16 Apr 2026 06:52:09 +0000 Subject: [PATCH 1/2] Add qwen3 moe experts only test Signed-off-by: Chenjie Luo --- .../quantization/plugins/test_huggingface.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index 692ab07d4a..5ae4e82e09 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -234,6 +234,57 @@ def test_is_homogeneous_hf_model_gpt_oss(): assert is_homogeneous_hf_model(model) +def test_qwen3_moe_nvfp4_experts_only_export_exclude_modules(tmp_path): + """Test that NVFP4_EXPERTS_ONLY_CFG correctly excludes non-expert modules in HF export. + + For a Qwen3 MoE model, only routed expert layers (mlp.experts.*) should be quantized. + Attention layers and lm_head should appear in the exported hf_quant_config.json + exclude_modules. + + Reference: https://huggingface.co/nvidia/Qwen3.5-397B-A17B-NVFP4/blob/main/hf_quant_config.json + """ + from modelopt.torch.export.unified_export_hf import export_hf_checkpoint + + model = get_tiny_qwen3_moe() + # from_config doesn't set architectures; export code requires it + model.config.architectures = ["Qwen3MoeForCausalLM"] + + # Quantize with NVFP4_EXPERTS_ONLY_CFG (targets only *mlp.experts* patterns) + mtq.quantize(model, mtq.NVFP4_EXPERTS_ONLY_CFG, lambda m: m(**m.dummy_inputs)) + + # Export + export_dir = tmp_path / "qwen3_moe_nvfp4_experts_only" + export_hf_checkpoint(model, export_dir=export_dir) + + # Load the generated hf_quant_config.json + import json + + hf_quant_config_path = export_dir / "hf_quant_config.json" + assert hf_quant_config_path.exists(), "hf_quant_config.json should be generated" + with open(hf_quant_config_path) as f: + hf_quant_config = json.load(f) + + quant_section = hf_quant_config["quantization"] + assert quant_section["quant_algo"] == "NVFP4" + exclude_modules = quant_section["exclude_modules"] + + # Attention layers must be excluded + assert any("self_attn" in m for m in exclude_modules), ( + f"self_attn should be in exclude_modules, got: {exclude_modules}" + ) + + # lm_head must be excluded + assert any("lm_head" in m for m in exclude_modules), ( + f"lm_head should be in exclude_modules, got: {exclude_modules}" + ) + + # No exclude pattern should match the routed experts + for pattern in exclude_modules: + assert not ("mlp.experts." in pattern and "shared" not in pattern), ( + f"Routed expert pattern should NOT be excluded: {pattern}" + ) + + def test_hf_decoder_discoverer_registration_path(): model = get_tiny_llama() assert any( From 78046c48e48662d4af1174c629e53542638b0a0c Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Thu, 16 Apr 2026 23:26:44 +0000 Subject: [PATCH 2/2] Fix comments Signed-off-by: Chenjie Luo --- tests/gpu/torch/export/test_export.py | 53 +++++++++++++++++++ .../quantization/plugins/test_huggingface.py | 51 ------------------ 2 files changed, 53 insertions(+), 51 deletions(-) diff --git a/tests/gpu/torch/export/test_export.py b/tests/gpu/torch/export/test_export.py index 55eee2c138..ef7c4474f8 100644 --- a/tests/gpu/torch/export/test_export.py +++ b/tests/gpu/torch/export/test_export.py @@ -13,6 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json +from fnmatch import fnmatch + import pytest import torch from _test_utils.torch.export.utils import ( @@ -29,6 +32,7 @@ partial_nvfp4_config, partial_w4a8_config, ) +from _test_utils.torch.transformers_models import get_tiny_qwen3_moe import modelopt.torch.quantization as mtq from modelopt.torch.export.model_config import ( @@ -53,6 +57,7 @@ postprocess_state_dict, process_layer_quant_config, ) +from modelopt.torch.export.unified_export_hf import export_hf_checkpoint from modelopt.torch.quantization.config import ( FP8_DEFAULT_CFG, INT4_AWQ_CFG, @@ -60,6 +65,7 @@ INT8_WEIGHT_ONLY_CFG, NVFP4_AWQ_LITE_CFG, NVFP4_DEFAULT_CFG, + NVFP4_EXPERTS_ONLY_CFG, W4A8_AWQ_BETA_CFG, ) from modelopt.torch.quantization.nn import SequentialQuantizer, TensorQuantizer @@ -466,3 +472,50 @@ def test_get_quant_config(config, expected): mtq.quantize(model, config, lambda x: x(torch.randn(1, 4, 10, device="cuda"))) quant_config = get_quant_config(model) assert quant_config["quantization"] == expected + + +def test_qwen3_moe_nvfp4_experts_only_export_exclude_modules(tmp_path): + """Test that NVFP4_EXPERTS_ONLY_CFG correctly excludes non-expert modules in HF export. + + For a Qwen3 MoE model, only routed expert layers (mlp.experts.*) should be quantized. + Attention layers and lm_head should appear in the exported hf_quant_config.json + exclude_modules. + + Reference: https://huggingface.co/nvidia/Qwen3.5-397B-A17B-NVFP4/blob/main/hf_quant_config.json + """ + model = get_tiny_qwen3_moe().to("cuda") + # from_config doesn't set architectures; export code requires it + model.config.architectures = ["Qwen3MoeForCausalLM"] + + # Quantize with NVFP4_EXPERTS_ONLY_CFG (targets only *mlp.experts* patterns) + mtq.quantize(model, NVFP4_EXPERTS_ONLY_CFG, lambda m: m(**m.dummy_inputs)) + + # Export + export_dir = tmp_path / "qwen3_moe_nvfp4_experts_only" + export_hf_checkpoint(model, export_dir=export_dir) + + # Load the generated hf_quant_config.json + hf_quant_config_path = export_dir / "hf_quant_config.json" + assert hf_quant_config_path.exists(), "hf_quant_config.json should be generated" + with open(hf_quant_config_path) as f: + hf_quant_config = json.load(f) + + quant_section = hf_quant_config["quantization"] + assert quant_section["quant_algo"] == "NVFP4" + exclude_modules = quant_section["exclude_modules"] + + def is_excluded(module_name: str) -> bool: + return any(fnmatch(module_name, pattern) for pattern in exclude_modules) + + # Attention layers must be excluded + assert is_excluded("model.layers.0.self_attn.q_proj"), ( + f"self_attn should be excluded, got patterns: {exclude_modules}" + ) + + # lm_head must be excluded + assert is_excluded("lm_head"), f"lm_head should be excluded, got patterns: {exclude_modules}" + + # Routed experts should NOT be excluded + assert not is_excluded("model.layers.0.mlp.experts.0.down_proj"), ( + f"Routed experts should not be excluded, got patterns: {exclude_modules}" + ) diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index 5ae4e82e09..692ab07d4a 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -234,57 +234,6 @@ def test_is_homogeneous_hf_model_gpt_oss(): assert is_homogeneous_hf_model(model) -def test_qwen3_moe_nvfp4_experts_only_export_exclude_modules(tmp_path): - """Test that NVFP4_EXPERTS_ONLY_CFG correctly excludes non-expert modules in HF export. - - For a Qwen3 MoE model, only routed expert layers (mlp.experts.*) should be quantized. - Attention layers and lm_head should appear in the exported hf_quant_config.json - exclude_modules. - - Reference: https://huggingface.co/nvidia/Qwen3.5-397B-A17B-NVFP4/blob/main/hf_quant_config.json - """ - from modelopt.torch.export.unified_export_hf import export_hf_checkpoint - - model = get_tiny_qwen3_moe() - # from_config doesn't set architectures; export code requires it - model.config.architectures = ["Qwen3MoeForCausalLM"] - - # Quantize with NVFP4_EXPERTS_ONLY_CFG (targets only *mlp.experts* patterns) - mtq.quantize(model, mtq.NVFP4_EXPERTS_ONLY_CFG, lambda m: m(**m.dummy_inputs)) - - # Export - export_dir = tmp_path / "qwen3_moe_nvfp4_experts_only" - export_hf_checkpoint(model, export_dir=export_dir) - - # Load the generated hf_quant_config.json - import json - - hf_quant_config_path = export_dir / "hf_quant_config.json" - assert hf_quant_config_path.exists(), "hf_quant_config.json should be generated" - with open(hf_quant_config_path) as f: - hf_quant_config = json.load(f) - - quant_section = hf_quant_config["quantization"] - assert quant_section["quant_algo"] == "NVFP4" - exclude_modules = quant_section["exclude_modules"] - - # Attention layers must be excluded - assert any("self_attn" in m for m in exclude_modules), ( - f"self_attn should be in exclude_modules, got: {exclude_modules}" - ) - - # lm_head must be excluded - assert any("lm_head" in m for m in exclude_modules), ( - f"lm_head should be in exclude_modules, got: {exclude_modules}" - ) - - # No exclude pattern should match the routed experts - for pattern in exclude_modules: - assert not ("mlp.experts." in pattern and "shared" not in pattern), ( - f"Routed expert pattern should NOT be excluded: {pattern}" - ) - - def test_hf_decoder_discoverer_registration_path(): model = get_tiny_llama() assert any(