Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions src/spac/templates/phenograph_clustering_cpu_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
Galaxy CPU template for PhenoGraph clustering.

Calls the standalone CPU path in
``spac.transform.clustering.phenograph.cpu.phenograph_cpu``. INDEPENDENT of
the legacy ``spac.transformations.phenograph_clustering`` function — that
continues to serve the legacy ``phenograph_clustering.xml`` tool unchanged.

Usage
-----
>>> from spac.templates.phenograph_clustering_cpu_template import run_from_json
>>> run_from_json("examples/phenograph_clustering_cpu_params.json")
"""
import sys
from pathlib import Path
from typing import Any, Dict, Union

# Add parent directory to path for SPAC imports (matches legacy template pattern)
sys.path.append(str(Path(__file__).parent.parent.parent))

from spac.transform.clustering.phenograph import phenograph_cpu
from spac.templates.template_utils import (
load_input,
parse_params,
save_results,
text_to_value,
)


def run_from_json(
    json_path: Union[str, Path, Dict[str, Any]],
    save_to_disk: bool = True,
    output_dir: Union[str, None] = None,
):
    """
    Execute CPU PhenoGraph Clustering from Galaxy JSON parameters.

    Mirrors the legacy template's run_from_json contract for drop-in
    replacement, minus the dead HPC fields.

    Parameters
    ----------
    json_path : str, Path, or dict
        Path to the Galaxy JSON parameter file, or an already-parsed
        parameter dictionary.
    save_to_disk : bool
        When True, pickle the clustered AnnData via ``save_results`` and
        return the mapping of output names to file paths. When False,
        return the AnnData object directly.
    output_dir : str, optional
        Destination directory for outputs. Falls back to the JSON's
        ``Output_Directory`` field, then the current directory.

    Returns
    -------
    dict or anndata.AnnData
        Saved-file mapping when ``save_to_disk`` is True, otherwise the
        in-memory AnnData carrying the new clustering annotation.
    """
    params = parse_params(json_path)

    if output_dir is None:
        output_dir = params.get("Output_Directory", ".")

    # Default output spec so save_results knows where to write the pickle
    # when the JSON omits an explicit "outputs" section.
    if "outputs" not in params:
        params["outputs"] = {
            "analysis": {"type": "file", "name": "output.pickle"}
        }

    # Load AnnData
    adata = load_input(params["Upstream_Analysis"])

    # "Original" is the Galaxy sentinel for "use adata.X" (no layer).
    layer = params.get("Table_to_Process", "Original")
    if layer == "Original":
        layer = None

    k = int(params.get("K_Nearest_Neighbors", 30))
    seed = int(params.get("Seed", 42))
    resolution = float(params.get("Resolution_Parameter", 1.0))
    output_name = params.get("Output_Annotation_Name", "phenograph")
    n_iterations = int(params.get("Number_of_Iterations", 100))

    print("Before PhenoGraph CPU Clustering:\n", adata)

    phenograph_cpu(
        adata=adata,
        features=None,  # use all var index
        layer=layer,
        k=k,
        seed=seed,
        resolution_parameter=resolution,
        n_iterations=n_iterations,
        output_annotation=output_name,
    )

    print(f'Count of cells in the output annotation "{output_name}":')
    print(adata.obs[output_name].value_counts())
    print("\nAfter PhenoGraph CPU Clustering:\n", adata)

    if save_to_disk:
        saved_files = save_results(
            results={"analysis": adata},
            params=params,
            output_base_dir=output_dir,
        )
        print(
            f"PhenoGraph CPU Clustering completed -> {saved_files['analysis']}"
        )
        return saved_files

    return adata


if __name__ == "__main__":
    # A params JSON is mandatory; the output directory is optional.
    argv = sys.argv
    if len(argv) < 2:
        print(
            "Usage: python phenograph_clustering_cpu_template.py "
            "<params.json> [output_dir]",
            file=sys.stderr,
        )
        sys.exit(1)

    result = run_from_json(
        json_path=argv[1],
        output_dir=argv[2] if len(argv) > 2 else None,
    )

    # save_to_disk defaults to True, so a dict of saved paths is the
    # usual outcome; an AnnData return means nothing was written.
    if isinstance(result, dict):
        print("\nOutput files:")
        for out_name, out_path in result.items():
            print(f"  {out_name}: {out_path}")
    else:
        print("\nReturned AnnData object")
146 changes: 146 additions & 0 deletions src/spac/templates/phenograph_clustering_gpu_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"""
Galaxy GPU template for PhenoGraph clustering.

Calls ``spac.transform.clustering.phenograph.gpu.grapheno.phenograph_gpu``
inside the GPU Docker container (``nciccbr/spac-gpu``). Supports profiling
mode, which runs Leiden at multiple resolutions sharing the same KNN and
Jaccard computation (a grapheno feature exposed via the parquet cache in
the job work directory).

Usage
-----
>>> from spac.templates.phenograph_clustering_gpu_template import run_from_json
>>> run_from_json("examples/phenograph_clustering_gpu_params.json")
"""
import sys
from pathlib import Path
from typing import Any, Dict, Union

sys.path.append(str(Path(__file__).parent.parent.parent))

from spac.transform.clustering.phenograph import phenograph_gpu
from spac.templates.template_utils import (
load_input,
parse_params,
save_results,
text_to_value,
)


def _resolutions(params):
profiling = bool(params.get("Profiling", False))
res_list = params.get("Resolution_List") or []
res_param = params.get("Resolution_Parameter", 1.0)

if profiling:
if not res_list:
raise ValueError(
"Profiling=True but Resolution_List is empty."
)
return True, [float(r) for r in res_list]
return False, [float(res_param)]


def run_from_json(
    json_path: Union[str, Path, Dict[str, Any]],
    save_to_disk: bool = True,
    output_dir: Union[str, None] = None,
):
    """
    Execute GPU PhenoGraph Clustering from Galaxy JSON parameters.

    Parameters
    ----------
    json_path : str, Path, or dict
        Path to the Galaxy JSON parameter file, or an already-parsed
        parameter dictionary.
    save_to_disk : bool
        When True, pickle the clustered AnnData via ``save_results`` and
        return the mapping of output names to file paths. When False,
        return the AnnData object directly.
    output_dir : str, optional
        Destination directory for outputs. Falls back to the JSON's
        ``Output_Directory`` field, then the current directory.

    Returns
    -------
    dict or anndata.AnnData
        Saved-file mapping when ``save_to_disk`` is True, otherwise the
        in-memory AnnData carrying the new clustering annotation(s).

    Raises
    ------
    RuntimeError
        If the RAPIDS-backed GPU implementation could not be imported
        (``phenograph_gpu`` is None in CPU-only environments).
    """
    if phenograph_gpu is None:
        raise RuntimeError(
            "GPU backend not available. Ensure RAPIDS (cuml, cugraph) is "
            "installed and this template is running inside the GPU Docker "
            "image (nciccbr/spac-gpu)."
        )

    params = parse_params(json_path)

    if output_dir is None:
        output_dir = params.get("Output_Directory", ".")

    # Default output spec so save_results knows where to write the pickle
    # when the JSON omits an explicit "outputs" section.
    if "outputs" not in params:
        params["outputs"] = {
            "analysis": {"type": "file", "name": "output.pickle"}
        }

    # Load AnnData
    adata = load_input(params["Upstream_Analysis"])

    # "Original" is the Galaxy sentinel for "use adata.X" (no layer).
    layer = params.get("Table_to_Process", "Original")
    if layer == "Original":
        layer = None

    k = int(params.get("K_Nearest_Neighbors", 30))
    seed = int(params.get("Seed", 42))
    output_name = params.get("Output_Annotation_Name", "phenograph")
    n_iterations = int(params.get("Number_of_Iterations", 100))

    profiling, resolutions = _resolutions(params)

    print("Before PhenoGraph GPU Clustering:\n", adata)
    print(
        f"Mode: {'profiling' if profiling else 'single'}, "
        f"resolutions={resolutions}"
    )

    # In profiling mode, the first resolution's call populates the parquet
    # cache (KNN + Jaccard); subsequent calls only re-run Leiden. The cache
    # lives in the process cwd, which Galaxy sets to the per-job work
    # directory.
    for res in resolutions:
        # Profiling writes one column per resolution so runs don't clobber
        # each other; single mode uses the requested name as-is.
        col_name = (
            f"{output_name}-{k}-{res}" if profiling else output_name
        )
        phenograph_gpu(
            adata=adata,
            features=None,  # use all var index
            layer=layer,
            k=k,
            seed=seed,
            resolution_parameter=res,
            n_iterations=n_iterations,
            output_annotation=col_name,
        )
        n_clusters = int(adata.obs[col_name].nunique())
        print(f"  {col_name}: {n_clusters} clusters")

    # Summary column for provenance (single-resolution mode uses output_name;
    # profiling mode uses the last resolution as a reference).
    summary_col = (
        f"{output_name}-{k}-{resolutions[-1]}" if profiling else output_name
    )
    print(f'\nCount of cells in "{summary_col}":')
    print(adata.obs[summary_col].value_counts())
    print("\nAfter PhenoGraph GPU Clustering:\n", adata)

    if save_to_disk:
        saved_files = save_results(
            results={"analysis": adata},
            params=params,
            output_base_dir=output_dir,
        )
        print(
            f"PhenoGraph GPU Clustering completed -> {saved_files['analysis']}"
        )
        return saved_files

    return adata


if __name__ == "__main__":
    # A params JSON is mandatory; the output directory is optional.
    argv = sys.argv
    if len(argv) < 2:
        print(
            "Usage: python phenograph_clustering_gpu_template.py "
            "<params.json> [output_dir]",
            file=sys.stderr,
        )
        sys.exit(1)

    result = run_from_json(
        json_path=argv[1],
        output_dir=argv[2] if len(argv) > 2 else None,
    )

    # save_to_disk defaults to True, so a dict of saved paths is the
    # usual outcome; an AnnData return means nothing was written.
    if isinstance(result, dict):
        print("\nOutput files:")
        for out_name, out_path in result.items():
            print(f"  {out_name}: {out_path}")
    else:
        print("\nReturned AnnData object")
5 changes: 4 additions & 1 deletion src/spac/templates/umap_transformation_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,10 @@ def run_from_json(
# Load the upstream analysis data
adata = load_input(params["Upstream_Analysis"])

# Extract parameters - Note: HPC parameters are ignored in SPAC version
# Extract parameters. HPC parameters (Run_on_HPC, Batch_Mode, Partition,
# Number_of_CPUs, Memory_GB, Request_Time) were removed from the Galaxy
# tool in v2.1.0 and are no longer sent. params.get() is kept for safety
# so older JSON blueprints containing those keys still load cleanly.
n_neighbors = params.get("Number_of_Neighbors", 75)
min_dist = params.get("Minimum_Distance_between_Points", 0.1)
n_components = params.get("Target_Dimension_Number", 2)
Expand Down
7 changes: 7 additions & 0 deletions src/spac/transform/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""
spac.transform — transformation algorithms, organized by type.

This module is a new top-level namespace per the April 17 refactoring decision.
For now, only phenograph clustering lives here. The rest of the transformations
continue to live in ``spac.transformations`` and will migrate gradually.
"""
3 changes: 3 additions & 0 deletions src/spac/transform/clustering/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""
spac.transform.clustering — clustering algorithms.
"""
37 changes: 37 additions & 0 deletions src/spac/transform/clustering/phenograph/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
spac.transform.clustering.phenograph — PhenoGraph clustering (CPU and GPU).

Public entry points
-------------------
prepare_features(adata, layer, features)
Shared preprocessing: extract a dense float32 feature matrix from AnnData.
Used by BOTH cpu and gpu paths to guarantee identical input.

phenograph_cpu(adata, features, layer, k, seed, ...)
CPU PhenoGraph via the `phenograph` Python package.
INDEPENDENT of spac.transformations.phenograph_clustering.

phenograph_gpu(adata, features, layer, k, seed, ...)
GPU PhenoGraph via grapheno (RAPIDS cuml + cugraph).
Only importable inside the GPU container; returns None in CPU-only envs.

Notes
-----
This module is a deliberate parallel to the Biowulf ``spac_phenograph/CPU/``
and ``spac_phenograph/GPU/`` folder split. Each path is a standalone
implementation; neither wraps the other. The legacy
``spac.transformations.phenograph_clustering`` function is untouched and
continues to serve the legacy ``phenograph_clustering.xml`` Galaxy tool.
"""

from .preprocess import prepare_features
from .cpu import phenograph_cpu

# GPU import is deferred. Import failures in CPU-only environments must not
# break ``from spac.transform.clustering.phenograph import phenograph_cpu``.
try:
from .gpu.grapheno import phenograph_gpu
except ImportError:
phenograph_gpu = None

__all__ = ["prepare_features", "phenograph_cpu", "phenograph_gpu"]
Loading
Loading