From aba72236eb6bd335572c2fca102b625e76278af7 Mon Sep 17 00:00:00 2001
From: Chenjie Luo <chenjiel@nvidia.com>
Date: Thu, 16 Apr 2026 06:52:09 +0000
Subject: [PATCH 1/2] Add qwen3 moe experts only test

Signed-off-by: Chenjie Luo <chenjiel@nvidia.com>
---
 .../quantization/plugins/test_huggingface.py  | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py
index 692ab07d4a..5ae4e82e09 100644
--- a/tests/unit/torch/quantization/plugins/test_huggingface.py
+++ b/tests/unit/torch/quantization/plugins/test_huggingface.py
@@ -234,6 +234,57 @@ def test_is_homogeneous_hf_model_gpt_oss():
     assert is_homogeneous_hf_model(model)
 
 
+def test_qwen3_moe_nvfp4_experts_only_export_exclude_modules(tmp_path):
+    """Test that NVFP4_EXPERTS_ONLY_CFG correctly excludes non-expert modules in HF export.
+
+    For a Qwen3 MoE model, only routed expert layers (mlp.experts.*) should be quantized.
+    Attention layers and lm_head should appear in the exported hf_quant_config.json
+    exclude_modules.
+
+    Reference: https://huggingface.co/nvidia/Qwen3.5-397B-A17B-NVFP4/blob/main/hf_quant_config.json
+    """
+    from modelopt.torch.export.unified_export_hf import export_hf_checkpoint
+
+    model = get_tiny_qwen3_moe()
+    # from_config doesn't set architectures; export code requires it
+    model.config.architectures = ["Qwen3MoeForCausalLM"]
+
+    # Quantize with NVFP4_EXPERTS_ONLY_CFG (targets only *mlp.experts* patterns)
+    mtq.quantize(model, mtq.NVFP4_EXPERTS_ONLY_CFG, lambda m: m(**m.dummy_inputs))
+
+    # Export
+    export_dir = tmp_path / "qwen3_moe_nvfp4_experts_only"
+    export_hf_checkpoint(model, export_dir=export_dir)
+
+    # Load the generated hf_quant_config.json
+    import json
+
+    hf_quant_config_path = export_dir / "hf_quant_config.json"
+    assert hf_quant_config_path.exists(), "hf_quant_config.json should be generated"
+    with open(hf_quant_config_path) as f:
+        hf_quant_config = json.load(f)
+
+    quant_section = hf_quant_config["quantization"]
+    assert quant_section["quant_algo"] == "NVFP4"
+    exclude_modules = quant_section["exclude_modules"]
+
+    # Attention layers must be excluded
+    assert any("self_attn" in m for m in exclude_modules), (
+        f"self_attn should be in exclude_modules, got: {exclude_modules}"
+    )
+
+    # lm_head must be excluded
+    assert any("lm_head" in m for m in exclude_modules), (
+        f"lm_head should be in exclude_modules, got: {exclude_modules}"
+    )
+
+    # No exclude pattern should match the routed experts
+    for pattern in exclude_modules:
+        assert not ("mlp.experts." in pattern and "shared" not in pattern), (
+            f"Routed expert pattern should NOT be excluded: {pattern}"
+        )
+
+
 def test_hf_decoder_discoverer_registration_path():
     model = get_tiny_llama()
     assert any(

From 78046c48e48662d4af1174c629e53542638b0a0c Mon Sep 17 00:00:00 2001
From: Chenjie Luo <chenjiel@nvidia.com>
Date: Thu, 16 Apr 2026 23:26:44 +0000
Subject: [PATCH 2/2] Move qwen3 moe experts only test to GPU export tests

Signed-off-by: Chenjie Luo <chenjiel@nvidia.com>
---
 tests/gpu/torch/export/test_export.py         | 53 +++++++++++++++++++
 .../quantization/plugins/test_huggingface.py  | 51 ------------------
 2 files changed, 53 insertions(+), 51 deletions(-)

diff --git a/tests/gpu/torch/export/test_export.py b/tests/gpu/torch/export/test_export.py
index 55eee2c138..ef7c4474f8 100644
--- a/tests/gpu/torch/export/test_export.py
+++ b/tests/gpu/torch/export/test_export.py
@@ -13,6 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
+from fnmatch import fnmatch
+
 import pytest
 import torch
 from _test_utils.torch.export.utils import (
@@ -29,6 +32,7 @@
     partial_nvfp4_config,
     partial_w4a8_config,
 )
+from _test_utils.torch.transformers_models import get_tiny_qwen3_moe
 
 import modelopt.torch.quantization as mtq
 from modelopt.torch.export.model_config import (
@@ -53,6 +57,7 @@
     postprocess_state_dict,
     process_layer_quant_config,
 )
+from modelopt.torch.export.unified_export_hf import export_hf_checkpoint
 from modelopt.torch.quantization.config import (
     FP8_DEFAULT_CFG,
     INT4_AWQ_CFG,
@@ -60,6 +65,7 @@
     INT8_WEIGHT_ONLY_CFG,
     NVFP4_AWQ_LITE_CFG,
     NVFP4_DEFAULT_CFG,
+    NVFP4_EXPERTS_ONLY_CFG,
     W4A8_AWQ_BETA_CFG,
 )
 from modelopt.torch.quantization.nn import SequentialQuantizer, TensorQuantizer
@@ -466,3 +472,50 @@ def test_get_quant_config(config, expected):
     mtq.quantize(model, config, lambda x: x(torch.randn(1, 4, 10, device="cuda")))
     quant_config = get_quant_config(model)
     assert quant_config["quantization"] == expected
+
+
+def test_qwen3_moe_nvfp4_experts_only_export_exclude_modules(tmp_path):
+    """Check that an NVFP4_EXPERTS_ONLY_CFG export lists non-expert modules as excluded.
+
+    Quantizes the model from get_tiny_qwen3_moe() on CUDA, exports it as an HF checkpoint,
+    and matches module names against the exclude_modules entries of hf_quant_config.json
+    with fnmatch: self_attn and lm_head must match, a routed expert (mlp.experts.*) must not.
+
+    Reference: https://huggingface.co/nvidia/Qwen3.5-397B-A17B-NVFP4/blob/main/hf_quant_config.json
+    """
+    model = get_tiny_qwen3_moe().to("cuda")
+    # from_config doesn't set architectures; export code requires it
+    model.config.architectures = ["Qwen3MoeForCausalLM"]
+
+    # Quantize with NVFP4_EXPERTS_ONLY_CFG (targets only *mlp.experts* patterns);
+    # the forward loop runs the model once on its own dummy_inputs.
+    mtq.quantize(model, NVFP4_EXPERTS_ONLY_CFG, lambda m: m(**m.dummy_inputs))
+
+    # Export the quantized model into a sub-directory of tmp_path
+    export_dir = tmp_path / "qwen3_moe_nvfp4_experts_only"
+    export_hf_checkpoint(model, export_dir=export_dir)
+
+    # Load the hf_quant_config.json written by the export
+    hf_quant_config_path = export_dir / "hf_quant_config.json"
+    assert hf_quant_config_path.exists(), "hf_quant_config.json should be generated"
+    with open(hf_quant_config_path) as f:
+        hf_quant_config = json.load(f)
+
+    quant_section = hf_quant_config["quantization"]
+    assert quant_section["quant_algo"] == "NVFP4"
+    exclude_modules = quant_section["exclude_modules"]
+
+    # A module counts as excluded when any exclude_modules entry matches it as a glob
+    def is_excluded(module_name: str) -> bool:
+        return any(fnmatch(module_name, pattern) for pattern in exclude_modules)
+
+    # Attention layers must be excluded (probed via layer 0's q_proj)
+    assert is_excluded("model.layers.0.self_attn.q_proj"), (
+        f"self_attn should be excluded, got patterns: {exclude_modules}"
+    )
+
+    # lm_head must be excluded
+    assert is_excluded("lm_head"), f"lm_head should be excluded, got patterns: {exclude_modules}"
+
+    # Routed experts should NOT be excluded (probed via layer 0, expert 0's down_proj)
+    assert not is_excluded("model.layers.0.mlp.experts.0.down_proj"), (
+        f"Routed experts should not be excluded, got patterns: {exclude_modules}"
+    )
diff --git a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py
index 5ae4e82e09..692ab07d4a 100644
--- a/tests/unit/torch/quantization/plugins/test_huggingface.py
+++ b/tests/unit/torch/quantization/plugins/test_huggingface.py
@@ -234,57 +234,6 @@ def test_is_homogeneous_hf_model_gpt_oss():
     assert is_homogeneous_hf_model(model)
 
 
-def test_qwen3_moe_nvfp4_experts_only_export_exclude_modules(tmp_path):
-    """Test that NVFP4_EXPERTS_ONLY_CFG correctly excludes non-expert modules in HF export.
-
-    For a Qwen3 MoE model, only routed expert layers (mlp.experts.*) should be quantized.
-    Attention layers and lm_head should appear in the exported hf_quant_config.json
-    exclude_modules.
-
-    Reference: https://huggingface.co/nvidia/Qwen3.5-397B-A17B-NVFP4/blob/main/hf_quant_config.json
-    """
-    from modelopt.torch.export.unified_export_hf import export_hf_checkpoint
-
-    model = get_tiny_qwen3_moe()
-    # from_config doesn't set architectures; export code requires it
-    model.config.architectures = ["Qwen3MoeForCausalLM"]
-
-    # Quantize with NVFP4_EXPERTS_ONLY_CFG (targets only *mlp.experts* patterns)
-    mtq.quantize(model, mtq.NVFP4_EXPERTS_ONLY_CFG, lambda m: m(**m.dummy_inputs))
-
-    # Export
-    export_dir = tmp_path / "qwen3_moe_nvfp4_experts_only"
-    export_hf_checkpoint(model, export_dir=export_dir)
-
-    # Load the generated hf_quant_config.json
-    import json
-
-    hf_quant_config_path = export_dir / "hf_quant_config.json"
-    assert hf_quant_config_path.exists(), "hf_quant_config.json should be generated"
-    with open(hf_quant_config_path) as f:
-        hf_quant_config = json.load(f)
-
-    quant_section = hf_quant_config["quantization"]
-    assert quant_section["quant_algo"] == "NVFP4"
-    exclude_modules = quant_section["exclude_modules"]
-
-    # Attention layers must be excluded
-    assert any("self_attn" in m for m in exclude_modules), (
-        f"self_attn should be in exclude_modules, got: {exclude_modules}"
-    )
-
-    # lm_head must be excluded
-    assert any("lm_head" in m for m in exclude_modules), (
-        f"lm_head should be in exclude_modules, got: {exclude_modules}"
-    )
-
-    # No exclude pattern should match the routed experts
-    for pattern in exclude_modules:
-        assert not ("mlp.experts." in pattern and "shared" not in pattern), (
-            f"Routed expert pattern should NOT be excluded: {pattern}"
-        )
-
-
 def test_hf_decoder_discoverer_registration_path():
     model = get_tiny_llama()
     assert any(