diff --git a/scripts/eval_scripts/CIFAR10/baseline.slrm b/scripts/eval_scripts/CIFAR10/baseline.slrm deleted file mode 100644 index 3ecb7a4..0000000 --- a/scripts/eval_scripts/CIFAR10/baseline.slrm +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="cifar" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=slurm-cifar10_baseline_160_%j.out -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/projects/imagenet_synthetic/fereshteh_datasets" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="cifar10" \ - --num_classes=10 diff --git a/scripts/eval_scripts/CIFAR10/icgan.slrm b/scripts/eval_scripts/CIFAR10/icgan.slrm deleted file mode 100644 index f4bf503..0000000 --- a/scripts/eval_scripts/CIFAR10/icgan.slrm +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="cifar" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=slurm-cifar10_baseline_160_%j.out -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/projects/imagenet_synthetic/fereshteh_datasets" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_seed43_bs128_rforig_2024-03-05-12-52/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="cifar10" \ - --num_classes=10 diff --git a/scripts/eval_scripts/CIFAR10/stablediff.slrm b/scripts/eval_scripts/CIFAR10/stablediff.slrm deleted file mode 100644 index 64361fb..0000000 --- a/scripts/eval_scripts/CIFAR10/stablediff.slrm +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="cifar" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=slurm-cifar10_baseline_160_%j.out -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/projects/imagenet_synthetic/fereshteh_datasets" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_p0p5_seed43_2024-03-05-13-39/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="cifar10" \ - --num_classes=10 diff --git a/scripts/eval_scripts/CIFAR100/baseline.slrm b/scripts/eval_scripts/CIFAR100/baseline.slrm deleted file mode 100644 index a68be76..0000000 --- a/scripts/eval_scripts/CIFAR100/baseline.slrm +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="cifar" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=cifar100_baseline_160_%j.out -#SBATCH --error=cifar100_baseline_160_%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/projects/imagenet_synthetic/fereshteh_datasets" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="cifar100" \ - --num_classes=100 \ No newline at end of file diff --git a/scripts/eval_scripts/CIFAR100/icgan.slrm b/scripts/eval_scripts/CIFAR100/icgan.slrm deleted file mode 100644 index 98a2125..0000000 --- a/scripts/eval_scripts/CIFAR100/icgan.slrm +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="cifar" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=cifar100_baseline_160_%j.out -#SBATCH --error=cifar100_baseline_160_%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/projects/imagenet_synthetic/fereshteh_datasets" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_seed43_bs128_rforig_2024-03-05-12-52/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="cifar100" \ - --num_classes=100 \ No newline at end of file diff --git a/scripts/eval_scripts/CIFAR100/stablediff.slrm b/scripts/eval_scripts/CIFAR100/stablediff.slrm deleted file mode 100644 index 9f6d928..0000000 --- a/scripts/eval_scripts/CIFAR100/stablediff.slrm +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="cifar" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=cifar100_baseline_160_%j.out -#SBATCH --error=cifar100_baseline_160_%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/projects/imagenet_synthetic/fereshteh_datasets" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_p0p5_seed43_2024-03-05-13-39/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="cifar100" \ - --num_classes=100 \ No newline at end of file diff --git a/scripts/eval_scripts/INaturalist/baseline.slrm b/scripts/eval_scripts/INaturalist/baseline.slrm deleted file mode 100644 index e68bef7..0000000 --- a/scripts/eval_scripts/INaturalist/baseline.slrm +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="inaturalist" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=inaturalist_baseline_%j.out -#SBATCH --error=inaturalist_baseline_%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/datasets/inat_comp/2018/" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="INaturalist" \ - --num_classes=8142 \ No newline at end of file diff --git a/scripts/eval_scripts/INaturalist/icgan.slrm b/scripts/eval_scripts/INaturalist/icgan.slrm deleted file mode 100644 index 2341e6f..0000000 --- a/scripts/eval_scripts/INaturalist/icgan.slrm +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="inaturalist" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=inaturalist_baseline_%j.out -#SBATCH --error=inaturalist_baseline_%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/datasets/inat_comp/2018/" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_seed43_bs128_rforig_2024-03-05-12-52/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="INaturalist" \ - --num_classes=8142 \ No newline at end of file diff --git a/scripts/eval_scripts/INaturalist/stablediff.slrm b/scripts/eval_scripts/INaturalist/stablediff.slrm deleted file mode 100644 index 29f1159..0000000 --- a/scripts/eval_scripts/INaturalist/stablediff.slrm +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="inaturalist" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=inaturalist_baseline_%j.out -#SBATCH --error=inaturalist_baseline_%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/datasets/inat_comp/2018/" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_p0p5_seed43_2024-03-05-13-39/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="INaturalist" \ - --num_classes=8142 \ No newline at end of file diff --git a/scripts/eval_scripts/food101/baseline.slrm b/scripts/eval_scripts/food101/baseline.slrm deleted file mode 100644 index f9f5fdf..0000000 --- a/scripts/eval_scripts/food101/baseline.slrm +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="food101" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=slurm-food101_baseline_160_%j.out -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/projects/imagenet_synthetic/fereshteh_datasets" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="food101" \ - --num_classes=101 \ No newline at end of file diff --git a/scripts/eval_scripts/food101/icgan.slrm b/scripts/eval_scripts/food101/icgan.slrm deleted file mode 100644 index c31f3a5..0000000 --- a/scripts/eval_scripts/food101/icgan.slrm +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="food101" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=slurm-food101_baseline_160_%j.out -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/projects/imagenet_synthetic/fereshteh_datasets" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="P/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_seed43_bs128_rforig_2024-03-05-12-52/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="food101" \ - --num_classes=101 \ No newline at end of file diff --git a/scripts/eval_scripts/food101/stablediff.slrm b/scripts/eval_scripts/food101/stablediff.slrm deleted file mode 100644 index a30522b..0000000 --- a/scripts/eval_scripts/food101/stablediff.slrm +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="food101" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=slurm-food101_baseline_160_%j.out -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/projects/imagenet_synthetic/fereshteh_datasets" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_p0p5_seed43_2024-03-05-13-39/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="food101" \ - --num_classes=101 \ No newline at end of file diff --git a/scripts/eval_scripts/imagenet/baseline.slrm b/scripts/eval_scripts/imagenet/baseline.slrm deleted file mode 100644 index 11417ec..0000000 --- a/scripts/eval_scripts/imagenet/baseline.slrm +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="imagenet_eval" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=imagenet_baseline_%j.out -#SBATCH --error=imagenet_baseline_%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/scratch/ssd004/datasets/imagenet256" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars --batch-size=2048 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar" \ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" diff --git a/scripts/eval_scripts/imagenet/icgan.slrm b/scripts/eval_scripts/imagenet/icgan.slrm deleted file mode 100644 index e68050d..0000000 --- a/scripts/eval_scripts/imagenet/icgan.slrm +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="imagenet_eval" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=imagenet_baseline_%j.out -#SBATCH --error=imagenet_baseline_%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/scratch/ssd004/datasets/imagenet256" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars --batch-size=2048 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_seed43_bs128_rforig_2024-03-05-12-52/checkpoint_0160.pth.tar" \ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" diff --git a/scripts/eval_scripts/imagenet/stablediff.slrm b/scripts/eval_scripts/imagenet/stablediff.slrm deleted file mode 100644 index 37c85c5..0000000 --- a/scripts/eval_scripts/imagenet/stablediff.slrm +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="imagenet_eval" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=imagenet_baseline_%j.out -#SBATCH --error=imagenet_baseline_%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/scratch/ssd004/datasets/imagenet256" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars --batch-size=2048 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_p0p5_seed43_2024-03-05-13-39/checkpoint_0160.pth.tar" \ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" diff --git a/scripts/eval_scripts/places365/baseline.slrm b/scripts/eval_scripts/places365/baseline.slrm deleted file mode 100644 index a619037..0000000 --- a/scripts/eval_scripts/places365/baseline.slrm +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="places365" -#SBATCH --partition=rtx6000 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=places365_baseline_160_%j.out -#SBATCH --error=places365_baseline_160_%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/projects/imagenet_synthetic/fereshteh_datasets/places365" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="places365" \ - --num_classes=434 \ No newline at end of file diff --git a/scripts/eval_scripts/places365/icgan.slrm b/scripts/eval_scripts/places365/icgan.slrm deleted file mode 100644 index 84a9317..0000000 --- a/scripts/eval_scripts/places365/icgan.slrm +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="places365" -#SBATCH --partition=rtx6000 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=places365_baseline_160_%j.out -#SBATCH --error=places365_baseline_160_%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/projects/imagenet_synthetic/fereshteh_datasets/places365" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_seed43_bs128_rforig_2024-03-05-12-52/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="places365" \ - --num_classes=434 \ No newline at end of file diff --git a/scripts/eval_scripts/places365/stablediff.slrm b/scripts/eval_scripts/places365/stablediff.slrm deleted file mode 100644 index 8985fae..0000000 --- a/scripts/eval_scripts/places365/stablediff.slrm +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="places365" -#SBATCH --partition=rtx6000 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=places365_baseline_160_%j.out -#SBATCH --error=places365_baseline_160_%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -echo $MASTER_ADDR -echo $MASTER_PORT - -export PYTHONPATH="." -nvidia-smi - -python simsiam/linear_eval.py \ - --data="/projects/imagenet_synthetic/fereshteh_datasets/places365" \ - --arch="resnet50" \ - --multiprocessing-distributed \ - --lars \ - --batch-size=4096 \ - --epochs=100 \ - -j=16 \ - --world-size 1 \ - --rank 0 \ - --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_p0p5_seed43_2024-03-05-13-39/checkpoint_0160.pth.tar"\ - --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="places365" \ - --num_classes=434 \ No newline at end of file diff --git a/scripts/generation_scripts/gen_img_icgan.slrm b/scripts/generation_scripts/gen_img_icgan.slrm index 6741e60..2e9e29e 100644 --- a/scripts/generation_scripts/gen_img_icgan.slrm +++ b/scripts/generation_scripts/gen_img_icgan.slrm @@ -14,7 +14,7 @@ PY_ARGS=${@:1} # activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate +source YOUR_VENV_PATH/bin/activate export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend export CUDA_LAUNCH_BLOCKING=1 @@ -26,7 +26,7 @@ export PYTHONPATH="." nvidia-smi srun python data_generation/img2img_icgan.py \ ---outdir /projects/imagenet_synthetic/synthetic_icgan \ +--outdir SAVE_DIR \ --num_shards=7 \ --shard_index=2 \ --image_version=1 \ diff --git a/scripts/generation_scripts/gen_img_stablediff.slrm b/scripts/generation_scripts/gen_img_stablediff.slrm index 6113e05..87852af 100644 --- a/scripts/generation_scripts/gen_img_stablediff.slrm +++ b/scripts/generation_scripts/gen_img_stablediff.slrm @@ -14,7 +14,7 @@ PY_ARGS=${@:1} # activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate +source YOUR_VENV_PATH/bin/activate export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend export CUDA_LAUNCH_BLOCKING=1 @@ -26,7 +26,7 @@ export PYTHONPATH="." nvidia-smi srun python data_generation/img2img_stable_diff.py \ ---outdir /projects/imagenet_synthetic/arashaf_stablediff_batched \ +--outdir SAVE_DIR \ --num_shards=7 \ --shard_index=2 \ --image_version=1 \ diff --git a/scripts/solo_learn/eval_solo_learn.slrm b/scripts/solo_learn/eval_solo_learn.slrm index 51a5731..0a3666b 100644 --- a/scripts/solo_learn/eval_solo_learn.slrm +++ b/scripts/solo_learn/eval_solo_learn.slrm @@ -1,8 +1,7 @@ #!/bin/bash -#SBATCH --job-name="eval_simsiam_single" -#SBATCH --partition=a40 -#SBATCH --qos=a40_arashaf +#SBATCH --job-name="eval_simclr_single" +#SBATCH --qos=m #SBATCH --nodes=1 #SBATCH --gres=gpu:a40:4 #SBATCH --ntasks-per-node=4 @@ -15,7 +14,7 @@ #SBATCH --time=12:00:00 # load virtual environment -source /ssd003/projects/aieng/envs/genssl3/bin/activate +source YOUR_VENV_PATH/bin/activate export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend diff --git a/scripts/solo_learn/train_solo_learn.slrm b/scripts/solo_learn/train_solo_learn.slrm index fbe9102..9ac8d76 100644 --- a/scripts/solo_learn/train_solo_learn.slrm +++ b/scripts/solo_learn/train_solo_learn.slrm @@ -1,8 +1,7 @@ #!/bin/bash #SBATCH --job-name="simclr_single_train" -#SBATCH --partition=a40 -#SBATCH --qos=a40_arashaf +#SBATCH --qos=m #SBATCH --nodes=1 #SBATCH --gres=gpu:a40:4 #SBATCH --ntasks-per-node=4 @@ -15,7 +14,7 @@ #SBATCH --time=72:00:00 # load virtual environment -source /ssd003/projects/aieng/envs/genssl3/bin/activate +source YOUR_VENV_PATH/bin/activate export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend diff --git a/scripts/solo_learn/train_synth_solo_learn.slrm b/scripts/solo_learn/train_synth_solo_learn.slrm deleted file mode 100644 index 4f11386..0000000 --- a/scripts/solo_learn/train_synth_solo_learn.slrm +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="simclr_single_train" -#SBATCH --partition=a40 -#SBATCH --qos=a40_arashaf -#SBATCH --nodes=1 -#SBATCH --gres=gpu:a40:4 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=8 -#SBATCH --mem=0 -#SBATCH --output=singlenode-%j.out -#SBATCH --error=singlenode-%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=72:00:00 - -# load virtual environment -source /ssd003/projects/aieng/envs/genssl3/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend - -export PYTHONPATH="." -nvidia-smi - -torchrun --nproc-per-node=4 --nnodes=1 solo-learn/main_pretrain.py \ - --config-path scripts/pretrain/imagenet/ \ - --config-name simclr_synthetic.yaml \ No newline at end of file diff --git a/scripts/train_scrpits/train_simsiam_multinode.slrm b/scripts/train_scrpits/train_simsiam_multinode.slrm deleted file mode 100644 index 0d4d55c..0000000 --- a/scripts/train_scrpits/train_simsiam_multinode.slrm +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="simsiam_multi_train" -#SBATCH --partition=a40 -#SBATCH --account=deadline -#SBATCH --qos=deadline -#SBATCH --nodes=2 -#SBATCH --gres=gpu:a40:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=01:00:00 -#SBATCH --cpus-per-task=4 -#SBATCH --mem-per-cpu=8G -#SBATCH --output=slurm-%j.out -#SBATCH --error=slurm-%j.err -# load virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 - - -export MASTER_ADDR="$(hostname --fqdn)" -export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" -export RDVZ_ID=$RANDOM -echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" - -export PYTHONPATH="." -nvidia-smi - -srun -p $SLURM_JOB_PARTITION \ - -c $SLURM_CPUS_ON_NODE \ - -N $SLURM_JOB_NUM_NODES \ - --mem=0 \ - --gres=gpu:$SLURM_JOB_PARTITION:$SLURM_GPUS_ON_NODE \ - bash -c 'torchrun \ - --nproc-per-node=$SLURM_GPUS_ON_NODE \ - --nnodes=$SLURM_JOB_NUM_NODES \ - --rdzv-endpoint $MASTER_ADDR:$MASTER_PORT \ - --rdzv-id $RDVZ_ID \ - --rdzv-backend c10d \ - simsiam/train_simsiam.py.py \ - -a resnet50 \ - --fix-pred-lr \ - --distributed_mode \ - --batch-size=128 \ - --epochs=200 \ - --experiment="simsiam_icgan_seed43_bs128_rforig" \ - --resume_from_checkpoint="/projects/imagenet_synthetic/model_checkpoints/_original_simsiam/checkpoint_0099.pth.tar" \ - --seed=43 \ - --use_synthetic_data \ - --synthetic_data_dir="/projects/imagenet_synthetic/synthetic_icgan" \ - --synthetic_index_min=0 \ - --synthetic_index_max=4 \ - --generative_augmentation_prob=0.5' \ No newline at end of file diff --git a/scripts/train_scrpits/train_simsiam_singlenode.slrm b/scripts/train_scrpits/train_simsiam_singlenode.slrm deleted file mode 100644 index 4be266e..0000000 --- a/scripts/train_scrpits/train_simsiam_singlenode.slrm +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name="simsiam_single_train" -#SBATCH --partition=a40 -#SBATCH --qos=deadline -#SBATCH --account=deadline -#SBATCH --nodes=1 -#SBATCH --gres=gpu:a40:4 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=singlenode-%j.out -#SBATCH --error=singlenode-%j.err -#SBATCH --open-mode=append -#SBATCH --wait-all-nodes=1 -#SBATCH --time=12:00:00 - -# activate virtual environment -source /ssd003/projects/aieng/envs/genssl2/bin/activate - -export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. -export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend -# export CUDA_LAUNCH_BLOCKING=1 - -export PYTHONPATH="." -nvidia-smi - -torchrun --nproc-per-node=4 --nnodes=1 simsiam/train_simsiam.py \ - -a resnet50 \ - --fix-pred-lr \ - --distributed_mode \ - --batch-size=128 \ - --epochs=100 \ - --experiment="simsiam_stablediff_p0p5_seed43" \ - --resume_from_checkpoint="" \ - --seed=43 \ - --use_synthetic_data \ - --synthetic_data_dir="/projects/imagenet_synthetic/arashaf_stablediff_batched" \ - --synthetic_index_min=0 \ - --synthetic_index_max=9 \ - --generative_augmentation_prob=0.5 \ No newline at end of file diff --git a/simsiam/LARC.py b/simsiam/LARC.py deleted file mode 100644 index fe41b13..0000000 --- a/simsiam/LARC.py +++ /dev/null @@ -1,107 +0,0 @@ -import torch -from torch import nn -from torch.nn.parameter import Parameter - - -class LARC(object): - """ - :class:`LARC` is a pytorch implementation of both the scaling and clipping variants of LARC, - in which the ratio between gradient and parameter magnitudes is used to calculate an adaptive - local learning rate for each individual parameter. The algorithm is designed to improve - convergence of large batch training. - - See https://arxiv.org/abs/1708.03888 for calculation of the local learning rate. - In practice it modifies the gradients of parameters as a proxy for modifying the learning rate - of the parameters. This design allows it to be used as a wrapper around any torch.optim Optimizer. - ``` - model = ... - optim = torch.optim.Adam(model.parameters(), lr=...) - optim = LARC(optim) - ``` - It can even be used in conjunction with apex.fp16_utils.FP16_optimizer. - ``` - model = ... - optim = torch.optim.Adam(model.parameters(), lr=...) - optim = LARC(optim) - optim = apex.fp16_utils.FP16_Optimizer(optim) - ``` - Args: - optimizer: Pytorch optimizer to wrap and modify learning rate for. - trust_coefficient: Trust coefficient for calculating the lr. See https://arxiv.org/abs/1708.03888 - clip: Decides between clipping or scaling mode of LARC. If `clip=True` the learning rate is set to `min(optimizer_lr, local_lr)` for each parameter. If `clip=False` the learning rate is set to `local_lr*optimizer_lr`. - eps: epsilon kludge to help with numerical stability while calculating adaptive_lr - """ - - def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1e-8): - self.optim = optimizer - self.trust_coefficient = trust_coefficient - self.eps = eps - self.clip = clip - - def __getstate__(self): - return self.optim.__getstate__() - - def __setstate__(self, state): - self.optim.__setstate__(state) - - @property - def state(self): - return self.optim.state - - def __repr__(self): - return self.optim.__repr__() - - @property - def param_groups(self): - return self.optim.param_groups - - @param_groups.setter - def param_groups(self, value): - self.optim.param_groups = value - - def state_dict(self): - return self.optim.state_dict() - - def load_state_dict(self, state_dict): - self.optim.load_state_dict(state_dict) - - def zero_grad(self): - self.optim.zero_grad() - - def add_param_group(self, param_group): - self.optim.add_param_group(param_group) - - def step(self): - with torch.no_grad(): - weight_decays = [] - for group in self.optim.param_groups: - # absorb weight decay control from optimizer - weight_decay = group["weight_decay"] if "weight_decay" in group else 0 - weight_decays.append(weight_decay) - group["weight_decay"] = 0 - for p in group["params"]: - if p.grad is None: - continue - param_norm = torch.norm(p.data) - grad_norm = torch.norm(p.grad.data) - - if param_norm != 0 and grad_norm != 0: - # calculate adaptive lr + weight decay - adaptive_lr = ( - self.trust_coefficient - * (param_norm) - / (grad_norm + param_norm * weight_decay + self.eps) - ) - - # clip learning rate for LARC - if self.clip: - # calculation of adaptive_lr so that when multiplied by lr it equals `min(adaptive_lr, lr)` - adaptive_lr = min(adaptive_lr / group["lr"], 1) - - p.grad.data += weight_decay * p.data - p.grad.data *= adaptive_lr - - self.optim.step() - # return weight decay control to optimizer - for i, group in enumerate(self.optim.param_groups): - group["weight_decay"] = weight_decays[i] diff --git a/simsiam/LICENSE b/simsiam/LICENSE deleted file mode 100644 index 105a4fb..0000000 --- a/simsiam/LICENSE +++ /dev/null @@ -1,399 +0,0 @@ -Attribution-NonCommercial 4.0 International - -======================================================================= - -Creative Commons Corporation ("Creative Commons") is not a law firm and -does not provide legal services or legal advice. Distribution of -Creative Commons public licenses does not create a lawyer-client or -other relationship. Creative Commons makes its licenses and related -information available on an "as-is" basis. Creative Commons gives no -warranties regarding its licenses, any material licensed under their -terms and conditions, or any related information. Creative Commons -disclaims all liability for damages resulting from their use to the -fullest extent possible. - -Using Creative Commons Public Licenses - -Creative Commons public licenses provide a standard set of terms and -conditions that creators and other rights holders may use to share -original works of authorship and other material subject to copyright -and certain other rights specified in the public license below. The -following considerations are for informational purposes only, are not -exhaustive, and do not form part of our licenses. - - Considerations for licensors: Our public licenses are - intended for use by those authorized to give the public - permission to use material in ways otherwise restricted by - copyright and certain other rights. Our licenses are - irrevocable. Licensors should read and understand the terms - and conditions of the license they choose before applying it. - Licensors should also secure all rights necessary before - applying our licenses so that the public can reuse the - material as expected. Licensors should clearly mark any - material not subject to the license. This includes other CC- - licensed material, or material used under an exception or - limitation to copyright. More considerations for licensors: - wiki.creativecommons.org/Considerations_for_licensors - - Considerations for the public: By using one of our public - licenses, a licensor grants the public permission to use the - licensed material under specified terms and conditions. If - the licensor's permission is not necessary for any reason--for - example, because of any applicable exception or limitation to - copyright--then that use is not regulated by the license. Our - licenses grant only permissions under copyright and certain - other rights that a licensor has authority to grant. Use of - the licensed material may still be restricted for other - reasons, including because others have copyright or other - rights in the material. A licensor may make special requests, - such as asking that all changes be marked or described. - Although not required by our licenses, you are encouraged to - respect those requests where reasonable. More_considerations - for the public: - wiki.creativecommons.org/Considerations_for_licensees - -======================================================================= - -Creative Commons Attribution-NonCommercial 4.0 International Public -License - -By exercising the Licensed Rights (defined below), You accept and agree -to be bound by the terms and conditions of this Creative Commons -Attribution-NonCommercial 4.0 International Public License ("Public -License"). To the extent this Public License may be interpreted as a -contract, You are granted the Licensed Rights in consideration of Your -acceptance of these terms and conditions, and the Licensor grants You -such rights in consideration of benefits the Licensor receives from -making the Licensed Material available under these terms and -conditions. - -Section 1 -- Definitions. - - a. Adapted Material means material subject to Copyright and Similar - Rights that is derived from or based upon the Licensed Material - and in which the Licensed Material is translated, altered, - arranged, transformed, or otherwise modified in a manner requiring - permission under the Copyright and Similar Rights held by the - Licensor. For purposes of this Public License, where the Licensed - Material is a musical work, performance, or sound recording, - Adapted Material is always produced where the Licensed Material is - synched in timed relation with a moving image. - - b. Adapter's License means the license You apply to Your Copyright - and Similar Rights in Your contributions to Adapted Material in - accordance with the terms and conditions of this Public License. - - c. Copyright and Similar Rights means copyright and/or similar rights - closely related to copyright including, without limitation, - performance, broadcast, sound recording, and Sui Generis Database - Rights, without regard to how the rights are labeled or - categorized. For purposes of this Public License, the rights - specified in Section 2(b)(1)-(2) are not Copyright and Similar - Rights. - d. Effective Technological Measures means those measures that, in the - absence of proper authority, may not be circumvented under laws - fulfilling obligations under Article 11 of the WIPO Copyright - Treaty adopted on December 20, 1996, and/or similar international - agreements. - - e. Exceptions and Limitations means fair use, fair dealing, and/or - any other exception or limitation to Copyright and Similar Rights - that applies to Your use of the Licensed Material. - - f. Licensed Material means the artistic or literary work, database, - or other material to which the Licensor applied this Public - License. - - g. Licensed Rights means the rights granted to You subject to the - terms and conditions of this Public License, which are limited to - all Copyright and Similar Rights that apply to Your use of the - Licensed Material and that the Licensor has authority to license. - - h. Licensor means the individual(s) or entity(ies) granting rights - under this Public License. - - i. NonCommercial means not primarily intended for or directed towards - commercial advantage or monetary compensation. For purposes of - this Public License, the exchange of the Licensed Material for - other material subject to Copyright and Similar Rights by digital - file-sharing or similar means is NonCommercial provided there is - no payment of monetary compensation in connection with the - exchange. - - j. Share means to provide material to the public by any means or - process that requires permission under the Licensed Rights, such - as reproduction, public display, public performance, distribution, - dissemination, communication, or importation, and to make material - available to the public including in ways that members of the - public may access the material from a place and at a time - individually chosen by them. - - k. Sui Generis Database Rights means rights other than copyright - resulting from Directive 96/9/EC of the European Parliament and of - the Council of 11 March 1996 on the legal protection of databases, - as amended and/or succeeded, as well as other essentially - equivalent rights anywhere in the world. - - l. You means the individual or entity exercising the Licensed Rights - under this Public License. Your has a corresponding meaning. - -Section 2 -- Scope. - - a. License grant. - - 1. Subject to the terms and conditions of this Public License, - the Licensor hereby grants You a worldwide, royalty-free, - non-sublicensable, non-exclusive, irrevocable license to - exercise the Licensed Rights in the Licensed Material to: - - a. reproduce and Share the Licensed Material, in whole or - in part, for NonCommercial purposes only; and - - b. produce, reproduce, and Share Adapted Material for - NonCommercial purposes only. - - 2. Exceptions and Limitations. For the avoidance of doubt, where - Exceptions and Limitations apply to Your use, this Public - License does not apply, and You do not need to comply with - its terms and conditions. - - 3. Term. The term of this Public License is specified in Section - 6(a). - - 4. Media and formats; technical modifications allowed. The - Licensor authorizes You to exercise the Licensed Rights in - all media and formats whether now known or hereafter created, - and to make technical modifications necessary to do so. The - Licensor waives and/or agrees not to assert any right or - authority to forbid You from making technical modifications - necessary to exercise the Licensed Rights, including - technical modifications necessary to circumvent Effective - Technological Measures. For purposes of this Public License, - simply making modifications authorized by this Section 2(a) - (4) never produces Adapted Material. - - 5. Downstream recipients. - - a. Offer from the Licensor -- Licensed Material. Every - recipient of the Licensed Material automatically - receives an offer from the Licensor to exercise the - Licensed Rights under the terms and conditions of this - Public License. - - b. No downstream restrictions. You may not offer or impose - any additional or different terms or conditions on, or - apply any Effective Technological Measures to, the - Licensed Material if doing so restricts exercise of the - Licensed Rights by any recipient of the Licensed - Material. - - 6. No endorsement. Nothing in this Public License constitutes or - may be construed as permission to assert or imply that You - are, or that Your use of the Licensed Material is, connected - with, or sponsored, endorsed, or granted official status by, - the Licensor or others designated to receive attribution as - provided in Section 3(a)(1)(A)(i). - - b. Other rights. - - 1. Moral rights, such as the right of integrity, are not - licensed under this Public License, nor are publicity, - privacy, and/or other similar personality rights; however, to - the extent possible, the Licensor waives and/or agrees not to - assert any such rights held by the Licensor to the limited - extent necessary to allow You to exercise the Licensed - Rights, but not otherwise. - - 2. Patent and trademark rights are not licensed under this - Public License. - - 3. To the extent possible, the Licensor waives any right to - collect royalties from You for the exercise of the Licensed - Rights, whether directly or through a collecting society - under any voluntary or waivable statutory or compulsory - licensing scheme. In all other cases the Licensor expressly - reserves any right to collect such royalties, including when - the Licensed Material is used other than for NonCommercial - purposes. - -Section 3 -- License Conditions. - -Your exercise of the Licensed Rights is expressly made subject to the -following conditions. - - a. Attribution. - - 1. If You Share the Licensed Material (including in modified - form), You must: - - a. retain the following if it is supplied by the Licensor - with the Licensed Material: - - i. identification of the creator(s) of the Licensed - Material and any others designated to receive - attribution, in any reasonable manner requested by - the Licensor (including by pseudonym if - designated); - - ii. a copyright notice; - - iii. a notice that refers to this Public License; - - iv. a notice that refers to the disclaimer of - warranties; - - v. a URI or hyperlink to the Licensed Material to the - extent reasonably practicable; - - b. indicate if You modified the Licensed Material and - retain an indication of any previous modifications; and - - c. indicate the Licensed Material is licensed under this - Public License, and include the text of, or the URI or - hyperlink to, this Public License. - - 2. You may satisfy the conditions in Section 3(a)(1) in any - reasonable manner based on the medium, means, and context in - which You Share the Licensed Material. For example, it may be - reasonable to satisfy the conditions by providing a URI or - hyperlink to a resource that includes the required - information. - - 3. If requested by the Licensor, You must remove any of the - information required by Section 3(a)(1)(A) to the extent - reasonably practicable. - - 4. If You Share Adapted Material You produce, the Adapter's - License You apply must not prevent recipients of the Adapted - Material from complying with this Public License. - -Section 4 -- Sui Generis Database Rights. - -Where the Licensed Rights include Sui Generis Database Rights that -apply to Your use of the Licensed Material: - - a. for the avoidance of doubt, Section 2(a)(1) grants You the right - to extract, reuse, reproduce, and Share all or a substantial - portion of the contents of the database for NonCommercial purposes - only; - - b. if You include all or a substantial portion of the database - contents in a database in which You have Sui Generis Database - Rights, then the database in which You have Sui Generis Database - Rights (but not its individual contents) is Adapted Material; and - - c. You must comply with the conditions in Section 3(a) if You Share - all or a substantial portion of the contents of the database. - -For the avoidance of doubt, this Section 4 supplements and does not -replace Your obligations under this Public License where the Licensed -Rights include other Copyright and Similar Rights. - -Section 5 -- Disclaimer of Warranties and Limitation of Liability. - - a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE - EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS - AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF - ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, - IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, - WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR - PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, - ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT - KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT - ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. - - b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE - TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, - NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, - INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, - COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR - USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN - ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR - DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR - IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. - - c. The disclaimer of warranties and limitation of liability provided - above shall be interpreted in a manner that, to the extent - possible, most closely approximates an absolute disclaimer and - waiver of all liability. - -Section 6 -- Term and Termination. - - a. This Public License applies for the term of the Copyright and - Similar Rights licensed here. However, if You fail to comply with - this Public License, then Your rights under this Public License - terminate automatically. - - b. Where Your right to use the Licensed Material has terminated under - Section 6(a), it reinstates: - - 1. automatically as of the date the violation is cured, provided - it is cured within 30 days of Your discovery of the - violation; or - - 2. upon express reinstatement by the Licensor. - - For the avoidance of doubt, this Section 6(b) does not affect any - right the Licensor may have to seek remedies for Your violations - of this Public License. - - c. For the avoidance of doubt, the Licensor may also offer the - Licensed Material under separate terms or conditions or stop - distributing the Licensed Material at any time; however, doing so - will not terminate this Public License. - - d. Sections 1, 5, 6, 7, and 8 survive termination of this Public - License. - -Section 7 -- Other Terms and Conditions. - - a. The Licensor shall not be bound by any additional or different - terms or conditions communicated by You unless expressly agreed. - - b. Any arrangements, understandings, or agreements regarding the - Licensed Material not stated herein are separate from and - independent of the terms and conditions of this Public License. - -Section 8 -- Interpretation. - - a. For the avoidance of doubt, this Public License does not, and - shall not be interpreted to, reduce, limit, restrict, or impose - conditions on any use of the Licensed Material that could lawfully - be made without permission under this Public License. - - b. To the extent possible, if any provision of this Public License is - deemed unenforceable, it shall be automatically reformed to the - minimum extent necessary to make it enforceable. If the provision - cannot be reformed, it shall be severed from this Public License - without affecting the enforceability of the remaining terms and - conditions. - - c. No term or condition of this Public License will be waived and no - failure to comply consented to unless expressly agreed to by the - Licensor. - - d. Nothing in this Public License constitutes or may be interpreted - as a limitation upon, or waiver of, any privileges and immunities - that apply to the Licensor or You, including from the legal - processes of any jurisdiction or authority. - -======================================================================= - -Creative Commons is not a party to its public -licenses. Notwithstanding, Creative Commons may elect to apply one of -its public licenses to material it publishes and in those instances -will be considered the “Licensor.” The text of the Creative Commons -public licenses is dedicated to the public domain under the CC0 Public -Domain Dedication. Except for the limited purpose of indicating that -material is shared under a Creative Commons public license or as -otherwise permitted by the Creative Commons policies published at -creativecommons.org/policies, Creative Commons does not authorize the -use of the trademark "Creative Commons" or any other trademark or logo -of Creative Commons without its prior written consent including, -without limitation, in connection with any unauthorized modifications -to any of its public licenses or any other arrangements, -understandings, or agreements concerning use of licensed material. For -the avoidance of doubt, this paragraph does not form part of the -public licenses. - -Creative Commons may be contacted at creativecommons.org. \ No newline at end of file diff --git a/simsiam/README.md b/simsiam/README.md deleted file mode 100644 index 47bab1b..0000000 --- a/simsiam/README.md +++ /dev/null @@ -1,96 +0,0 @@ -# SimSiam: Exploring Simple Siamese Representation Learning - -
-
-
| pre-train epochs |
-batch size |
-pre-train ckpt |
-pre-train log |
-linear cls. ckpt |
-linear cls. log |
-top-1 acc. | - -
|---|---|---|---|---|---|---|
| 100 | -512 | -link | -link | -link | -link | -68.1 | -
| 100 | -256 | -link | -link | -link | -link | -68.3 | -