Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions src/spac/templates/phenograph_clustering_cpu_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
Galaxy CPU template for PhenoGraph clustering.

Calls the standalone CPU path in
``spac.transform.clustering.phenograph.cpu.phenograph_cpu``. INDEPENDENT of
the legacy ``spac.transformations.phenograph_clustering`` function — that
continues to serve the legacy ``phenograph_clustering.xml`` tool unchanged.

Usage
-----
>>> from spac.templates.phenograph_clustering_cpu_template import run_from_json
>>> run_from_json("examples/phenograph_clustering_cpu_params.json")
"""
import sys
from pathlib import Path
from typing import Any, Dict, Union

# Add parent directory to path for SPAC imports (matches legacy template pattern)
sys.path.append(str(Path(__file__).parent.parent.parent))

from spac.transform.clustering.phenograph import phenograph_cpu
from spac.templates.template_utils import (
load_input,
parse_params,
save_results,
text_to_value,
)


def run_from_json(
    json_path: Union[str, Path, Dict[str, Any]],
    save_to_disk: bool = True,
    output_dir: Union[str, None] = None,
):
    """
    Execute CPU PhenoGraph Clustering from Galaxy JSON parameters.

    Mirrors the legacy template's run_from_json contract for drop-in
    replacement, minus the dead HPC fields.

    Parameters
    ----------
    json_path : str, Path, or dict
        Path to the Galaxy JSON parameter file, or an already-parsed
        parameter dictionary.
    save_to_disk : bool
        When True, pickle the clustered AnnData via ``save_results`` and
        return the mapping of output names to file paths. When False,
        return the AnnData object directly.
    output_dir : str, optional
        Destination directory for outputs. Falls back to the JSON's
        ``Output_Directory`` field, then the current directory.

    Returns
    -------
    dict or anndata.AnnData
        Saved-file mapping when ``save_to_disk`` is True, otherwise the
        in-memory AnnData carrying the new clustering annotation.
    """
    params = parse_params(json_path)

    if output_dir is None:
        output_dir = params.get("Output_Directory", ".")

    # Default output spec so save_results knows where to write the pickle
    # when the JSON omits an explicit "outputs" section.
    if "outputs" not in params:
        params["outputs"] = {
            "analysis": {"type": "file", "name": "output.pickle"}
        }

    # Load AnnData
    adata = load_input(params["Upstream_Analysis"])

    # "Original" is the Galaxy sentinel for "use adata.X" (no layer).
    layer = params.get("Table_to_Process", "Original")
    if layer == "Original":
        layer = None

    k = int(params.get("K_Nearest_Neighbors", 30))
    seed = int(params.get("Seed", 42))
    resolution = float(params.get("Resolution_Parameter", 1.0))
    output_name = params.get("Output_Annotation_Name", "phenograph")
    n_iterations = int(params.get("Number_of_Iterations", 100))

    print("Before PhenoGraph CPU Clustering:\n", adata)

    phenograph_cpu(
        adata=adata,
        features=None,  # use all var index
        layer=layer,
        k=k,
        seed=seed,
        resolution_parameter=resolution,
        n_iterations=n_iterations,
        output_annotation=output_name,
    )

    print(f'Count of cells in the output annotation "{output_name}":')
    print(adata.obs[output_name].value_counts())
    print("\nAfter PhenoGraph CPU Clustering:\n", adata)

    if save_to_disk:
        saved_files = save_results(
            results={"analysis": adata},
            params=params,
            output_base_dir=output_dir,
        )
        print(
            f"PhenoGraph CPU Clustering completed -> {saved_files['analysis']}"
        )
        return saved_files

    return adata


if __name__ == "__main__":
    # A params JSON is mandatory; the output directory is optional.
    argv = sys.argv
    if len(argv) < 2:
        print(
            "Usage: python phenograph_clustering_cpu_template.py "
            "<params.json> [output_dir]",
            file=sys.stderr,
        )
        sys.exit(1)

    result = run_from_json(
        json_path=argv[1],
        output_dir=argv[2] if len(argv) > 2 else None,
    )

    # save_to_disk defaults to True, so a dict of saved paths is the
    # usual outcome; an AnnData return means nothing was written.
    if isinstance(result, dict):
        print("\nOutput files:")
        for out_name, out_path in result.items():
            print(f"  {out_name}: {out_path}")
    else:
        print("\nReturned AnnData object")
146 changes: 146 additions & 0 deletions src/spac/templates/phenograph_clustering_gpu_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"""
Galaxy GPU template for PhenoGraph clustering.

Calls ``spac.transform.clustering.phenograph.gpu.grapheno.phenograph_gpu``
inside the GPU Docker container (``nciccbr/spac-gpu``). Supports profiling
mode, which runs Leiden at multiple resolutions sharing the same KNN and
Jaccard computation (a grapheno feature exposed via the parquet cache in
the job work directory).

Usage
-----
>>> from spac.templates.phenograph_clustering_gpu_template import run_from_json
>>> run_from_json("examples/phenograph_clustering_gpu_params.json")
"""
import sys
from pathlib import Path
from typing import Any, Dict, Union

sys.path.append(str(Path(__file__).parent.parent.parent))

from spac.transform.clustering.phenograph import phenograph_gpu
from spac.templates.template_utils import (
load_input,
parse_params,
save_results,
text_to_value,
)


def _resolutions(params):
profiling = bool(params.get("Profiling", False))
res_list = params.get("Resolution_List") or []
res_param = params.get("Resolution_Parameter", 1.0)

if profiling:
if not res_list:
raise ValueError(
"Profiling=True but Resolution_List is empty."
)
return True, [float(r) for r in res_list]
return False, [float(res_param)]


def run_from_json(
    json_path: Union[str, Path, Dict[str, Any]],
    save_to_disk: bool = True,
    output_dir: Union[str, None] = None,
):
    """
    Execute GPU PhenoGraph Clustering from Galaxy JSON parameters.

    Parameters
    ----------
    json_path : str, Path, or dict
        Path to the Galaxy JSON parameter file, or an already-parsed
        parameter dictionary.
    save_to_disk : bool
        When True, pickle the clustered AnnData via ``save_results`` and
        return the mapping of output names to file paths. When False,
        return the AnnData object directly.
    output_dir : str, optional
        Destination directory for outputs. Falls back to the JSON's
        ``Output_Directory`` field, then the current directory.

    Returns
    -------
    dict or anndata.AnnData
        Saved-file mapping when ``save_to_disk`` is True, otherwise the
        in-memory AnnData carrying the new clustering annotation(s).

    Raises
    ------
    RuntimeError
        If the RAPIDS-backed GPU implementation could not be imported
        (``phenograph_gpu`` is None in CPU-only environments).
    """
    if phenograph_gpu is None:
        raise RuntimeError(
            "GPU backend not available. Ensure RAPIDS (cuml, cugraph) is "
            "installed and this template is running inside the GPU Docker "
            "image (nciccbr/spac-gpu)."
        )

    params = parse_params(json_path)

    if output_dir is None:
        output_dir = params.get("Output_Directory", ".")

    # Default output spec so save_results knows where to write the pickle
    # when the JSON omits an explicit "outputs" section.
    if "outputs" not in params:
        params["outputs"] = {
            "analysis": {"type": "file", "name": "output.pickle"}
        }

    # Load AnnData
    adata = load_input(params["Upstream_Analysis"])

    # "Original" is the Galaxy sentinel for "use adata.X" (no layer).
    layer = params.get("Table_to_Process", "Original")
    if layer == "Original":
        layer = None

    k = int(params.get("K_Nearest_Neighbors", 30))
    seed = int(params.get("Seed", 42))
    output_name = params.get("Output_Annotation_Name", "phenograph")
    n_iterations = int(params.get("Number_of_Iterations", 100))

    profiling, resolutions = _resolutions(params)

    print("Before PhenoGraph GPU Clustering:\n", adata)
    print(
        f"Mode: {'profiling' if profiling else 'single'}, "
        f"resolutions={resolutions}"
    )

    # In profiling mode, the first resolution's call populates the parquet
    # cache (KNN + Jaccard); subsequent calls only re-run Leiden. The cache
    # lives in the process cwd, which Galaxy sets to the per-job work
    # directory.
    for res in resolutions:
        # Profiling writes one column per resolution so runs don't clobber
        # each other; single mode uses the requested name as-is.
        col_name = (
            f"{output_name}-{k}-{res}" if profiling else output_name
        )
        phenograph_gpu(
            adata=adata,
            features=None,  # use all var index
            layer=layer,
            k=k,
            seed=seed,
            resolution_parameter=res,
            n_iterations=n_iterations,
            output_annotation=col_name,
        )
        n_clusters = int(adata.obs[col_name].nunique())
        print(f"  {col_name}: {n_clusters} clusters")

    # Summary column for provenance (single-resolution mode uses output_name;
    # profiling mode uses the last resolution as a reference).
    summary_col = (
        f"{output_name}-{k}-{resolutions[-1]}" if profiling else output_name
    )
    print(f'\nCount of cells in "{summary_col}":')
    print(adata.obs[summary_col].value_counts())
    print("\nAfter PhenoGraph GPU Clustering:\n", adata)

    if save_to_disk:
        saved_files = save_results(
            results={"analysis": adata},
            params=params,
            output_base_dir=output_dir,
        )
        print(
            f"PhenoGraph GPU Clustering completed -> {saved_files['analysis']}"
        )
        return saved_files

    return adata


if __name__ == "__main__":
    # A params JSON is mandatory; the output directory is optional.
    argv = sys.argv
    if len(argv) < 2:
        print(
            "Usage: python phenograph_clustering_gpu_template.py "
            "<params.json> [output_dir]",
            file=sys.stderr,
        )
        sys.exit(1)

    result = run_from_json(
        json_path=argv[1],
        output_dir=argv[2] if len(argv) > 2 else None,
    )

    # save_to_disk defaults to True, so a dict of saved paths is the
    # usual outcome; an AnnData return means nothing was written.
    if isinstance(result, dict):
        print("\nOutput files:")
        for out_name, out_path in result.items():
            print(f"  {out_name}: {out_path}")
    else:
        print("\nReturned AnnData object")
5 changes: 4 additions & 1 deletion src/spac/templates/umap_transformation_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,10 @@ def run_from_json(
# Load the upstream analysis data
adata = load_input(params["Upstream_Analysis"])

# Extract parameters - Note: HPC parameters are ignored in SPAC version
# Extract parameters. HPC parameters (Run_on_HPC, Batch_Mode, Partition,
# Number_of_CPUs, Memory_GB, Request_Time) were removed from the Galaxy
# tool in v2.1.0 and are no longer sent. params.get() is kept for safety
# so older JSON blueprints containing those keys still load cleanly.
n_neighbors = params.get("Number_of_Neighbors", 75)
min_dist = params.get("Minimum_Distance_between_Points", 0.1)
n_components = params.get("Target_Dimension_Number", 2)
Expand Down
7 changes: 7 additions & 0 deletions src/spac/transform/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""
spac.transform — transformation algorithms, organized by type.

This module is a new top-level namespace per the April 17 refactoring decision.
For now, only phenograph clustering lives here. The rest of the transformations
continue to live in ``spac.transformations`` and will migrate gradually.
"""
3 changes: 3 additions & 0 deletions src/spac/transform/clustering/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""
spac.transform.clustering — clustering algorithms.
"""
37 changes: 37 additions & 0 deletions src/spac/transform/clustering/phenograph/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
spac.transform.clustering.phenograph — PhenoGraph clustering (CPU and GPU).

Public entry points
-------------------
prepare_features(adata, layer, features)
Shared preprocessing: extract a dense float32 feature matrix from AnnData.
Used by BOTH cpu and gpu paths to guarantee identical input.

phenograph_cpu(adata, features, layer, k, seed, ...)
CPU PhenoGraph via the `phenograph` Python package.
INDEPENDENT of spac.transformations.phenograph_clustering.

phenograph_gpu(adata, features, layer, k, seed, ...)
GPU PhenoGraph via grapheno (RAPIDS cuml + cugraph).
Only importable inside the GPU container; returns None in CPU-only envs.

Notes
-----
This module is a deliberate parallel to the Biowulf ``spac_phenograph/CPU/``
and ``spac_phenograph/GPU/`` folder split. Each path is a standalone
implementation; neither wraps the other. The legacy
``spac.transformations.phenograph_clustering`` function is untouched and
continues to serve the legacy ``phenograph_clustering.xml`` Galaxy tool.
"""

from .preprocess import prepare_features
from .cpu import phenograph_cpu

# GPU import is deferred. Import failures in CPU-only environments must not
# break ``from spac.transform.clustering.phenograph import phenograph_cpu``.
try:
from .gpu.grapheno import phenograph_gpu
except ImportError:
phenograph_gpu = None

__all__ = ["prepare_features", "phenograph_cpu", "phenograph_gpu"]
Loading
Loading