From acb48e2b17a82d0296246a8ed9dee72c24eb4c6c Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 24 Jan 2024 10:54:10 -0800 Subject: [PATCH 01/57] Distributed rcdm --- .../data_aug/contrastive_learning_dataset.py | 18 +++++++------ run_simCLR.py | 25 +++++++++---------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index d8eb61d..5e9e709 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -13,7 +13,7 @@ def __init__(self, root_folder): self.root_folder = root_folder @staticmethod - def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True): + def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True, device_id=None): """Return a set of data augmentation transformations as described in the SimCLR paper. Args: @@ -32,19 +32,21 @@ def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True): ] if rcdm_agumentation: rcdm_config = get_config() - transform_list.append(RCDMInference(rcdm_config)) + transform_list.append(RCDMInference(rcdm_config, device_id)) transform_list.append(transforms.Resize(size=(size, size))) + return transforms.Compose(transform_list) - def get_dataset(self, name, n_views, rcdm_agumentation=True): + def get_dataset(self, name, n_views, rcdm_agumentation=False, device_id=None): valid_datasets = { "cifar10": lambda: datasets.CIFAR10( self.root_folder, train=True, transform=ContrastiveLearningViewGenerator( self.get_simclr_pipeline_transform( - 32, rcdm_agumentation=rcdm_agumentation + 32, rcdm_agumentation=rcdm_agumentation, + device_id=device_id ), n_views, ), @@ -55,7 +57,8 @@ def get_dataset(self, name, n_views, rcdm_agumentation=True): split="unlabeled", transform=ContrastiveLearningViewGenerator( self.get_simclr_pipeline_transform( - 96, rcdm_agumentation=rcdm_agumentation + 96, rcdm_agumentation=rcdm_agumentation, + device_id=device_id 
), n_views, ), @@ -66,7 +69,8 @@ def get_dataset(self, name, n_views, rcdm_agumentation=True): split="train", transform=ContrastiveLearningViewGenerator( self.get_simclr_pipeline_transform( - 224, rcdm_agumentation=rcdm_agumentation + 224, rcdm_agumentation=rcdm_agumentation, + device_id=device_id ), n_views, ), @@ -78,4 +82,4 @@ def get_dataset(self, name, n_views, rcdm_agumentation=True): except KeyError: raise InvalidDatasetSelection() else: - return dataset_fn() + return dataset_fn() \ No newline at end of file diff --git a/run_simCLR.py b/run_simCLR.py index f4d6b84..564416b 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -110,7 +110,6 @@ parser.add_argument( "--rcdm_agumentation", action="store_true", help="Use RCDM agumentation or not." ) - parser.add_argument( "--distributed_mode", action="store_true", help="Enable distributed training" ) @@ -145,30 +144,30 @@ def main(): args.n_views == 2 ), "Only two view training is supported. Please use --n-views 2." + if args.distributed_mode: + dist_utils.init_distributed_mode( + launcher=args.distributed_launcher, + backend=args.distributed_backend, + ) + device_id = torch.cuda.current_device() + else: + device_id = None + dataset = ContrastiveLearningDataset(args.data) train_dataset = dataset.get_dataset( args.dataset_name, args.n_views, args.rcdm_agumentation, + device_id ) - train_sampler = None - if args.distributed_mode: - dist_utils.init_distributed_mode( - launcher=args.distributed_launcher, - backend=args.distributed_backend, - ) - device_id = torch.cuda.current_device() - if dist_utils.is_dist_avail_and_initialized(): + if dist_utils.is_dist_avail_and_initialized() and args.distributed_mode: train_sampler = DistributedSampler( train_dataset, seed=args.seed, drop_last=True, ) - else: - device_id = None - init_fn = partial( worker_init_fn, num_workers=args.num_workers, @@ -218,4 +217,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file From 
cdec0c9ab3e7ac414c63842b59bed0a2c661b59b Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 25 Jan 2024 15:27:12 -0800 Subject: [PATCH 02/57] Add rcdm model --- .../data_aug/contrastive_learning_dataset.py | 36 ++++++++++++------- SimCLR/data_aug/rcdm_aug.py | 14 ++++---- SimCLR/data_aug/rcdm_config.py | 7 ++-- rcdm/guided_diffusion_rcdm/get_rcdm_models.py | 20 +++++------ rcdm/guided_diffusion_rcdm/get_ssl_models.py | 10 +++--- run_simCLR.py | 7 ++-- 6 files changed, 55 insertions(+), 39 deletions(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index 5e9e709..71552b0 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -6,7 +6,7 @@ from SimCLR.data_aug.rcdm_config import get_config from SimCLR.data_aug.view_generator import ContrastiveLearningViewGenerator from SimCLR.exceptions.exceptions import InvalidDatasetSelection - +import random class ContrastiveLearningDataset: def __init__(self, root_folder): @@ -21,19 +21,29 @@ def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True, device_id=N s (float, optional): Magnitude of the color distortion. Defaults to 1. rcdm_agumentation (bool, optional): Whether to use RCDM augmentation. Defaults to True. 
""" - color_jitter = transforms.ColorJitter(0.8 * s, 0.8 * s, 0.8 * s, 0.2 * s) - transform_list = [ - transforms.RandomResizedCrop(size=size), - transforms.RandomHorizontalFlip(), - transforms.RandomApply([color_jitter], p=0.8), - transforms.RandomGrayscale(p=0.2), - GaussianBlur(kernel_size=int(0.1 * size)), - transforms.ToTensor(), - ] - if rcdm_agumentation: + prob = random.uniform(0, 1) + if prob < 0.5 and rcdm_agumentation: rcdm_config = get_config() - transform_list.append(RCDMInference(rcdm_config, device_id)) - transform_list.append(transforms.Resize(size=(size, size))) + timestep_respacing = ["ddim10", "ddim25", "ddim50"] + rcdm_config.timestep_respacing = timestep_respacing[random.randrange(len(timestep_respacing))] + transform_list = [ + transforms.Resize(size=(size,size)), + transforms.ToTensor(), + RCDMInference(rcdm_config, device_id), + transforms.RandomHorizontalFlip(), + transforms.Resize(size=(size,size)), + ] + else: + color_jitter = transforms.ColorJitter(0.8 * s, 0.8 * s, 0.8 * s, 0.2 * s) + transform_list = [ + transforms.RandomResizedCrop(size=size), + transforms.RandomHorizontalFlip(), + transforms.RandomApply([color_jitter], p=0.8), + transforms.RandomGrayscale(p=0.2), + GaussianBlur(kernel_size=int(0.1 * size)), + transforms.ToTensor(), + ] + return transforms.Compose(transform_list) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index fa4566d..2d917df 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -1,5 +1,4 @@ import torch -from rcdm.guided_diffusion_rcdm import dist_util from rcdm.guided_diffusion_rcdm.get_ssl_models import get_model from rcdm.guided_diffusion_rcdm.get_rcdm_models import get_dict_rcdm_model from rcdm.guided_diffusion_rcdm.script_util import ( @@ -9,14 +8,15 @@ class RCDMInference(object): - def __init__(self, config): + def __init__(self, config, device_id): """ Initialize the RCDMInference class with necessary parameters and load models. 
""" self.config = config + self.device_id = device_id # Load SSL model - self.ssl_model = get_model(self.config.type_model, self.config.use_head).cuda().eval() + self.ssl_model = get_model(self.config.type_model, self.config.use_head, self.model_dir).cuda(self.device_id).eval() for p in self.ssl_model.parameters(): p.requires_grad = False @@ -29,11 +29,11 @@ def __init__(self, config): ) if self.config.model_path == "": - trained_model = get_dict_rcdm_model(self.config.type_model, self.config.use_head) + trained_model = get_dict_rcdm_model(self.config.type_model, self.config.use_head, self.model_dir) else: trained_model = torch.load(self.config.model_path, map_location="cpu") self.model.load_state_dict(trained_model, strict=True) - self.model.to(dist_util.dev()) + self.model.cuda(self.device_id) def __call__(self, img): """ @@ -49,7 +49,7 @@ def __call__(self, img): sample_fn = self.diffusion.p_sample_loop if not self.config.use_ddim else self.diffusion.ddim_sample_loop - img = img.unsqueeze(0).repeat(1, 1, 1, 1).cuda() + img = img.unsqueeze(0).repeat(1, 1, 1, 1).cuda(self.device_id) model_kwargs = {} with torch.no_grad(): @@ -63,4 +63,4 @@ def __call__(self, img): ) print("Sampling completed!") - return sample.squeeze(0).cpu() \ No newline at end of file + return sample.detach().squeeze(0) \ No newline at end of file diff --git a/SimCLR/data_aug/rcdm_config.py b/SimCLR/data_aug/rcdm_config.py index ad7d3b5..4af056f 100644 --- a/SimCLR/data_aug/rcdm_config.py +++ b/SimCLR/data_aug/rcdm_config.py @@ -4,14 +4,15 @@ def get_config(): config = ml_collections.ConfigDict() config.image_size = 128 # The size of the images to generate. config.class_cond = False # If true, use class conditional generation. + config.model_dir = "/ssd003/projects/aieng/genssl" # Path to the directory containing the model. config.type_model = "simclr" # Type of model to use (e.g., simclr, dino). config.use_head = False # If true, use the projector/head for SSL representation. 
config.model_path = "" # Replace with the path to your model if you have one. - config.use_ddim = False # If true, use DDIM sampler. + config.use_ddim = True # If true, use DDIM sampler. config.no_shared = True # If false, enables squeeze and excitation. config.clip_denoised = True # If true, clip denoised images. config.attention_resolutions = "32,16,8" # Resolutions to use for attention layers. - config.diffusion_steps = 100 # Number of diffusion steps. + config.diffusion_steps = 1000 # Number of diffusion steps. config.learn_sigma = True # If true, learn the noise level. config.noise_schedule = "linear" # Type of noise schedule (e.g., linear). config.num_channels = 256 # Number of channels in the model. @@ -22,5 +23,7 @@ def get_config(): config.use_scale_shift_norm = True # If true, use scale-shift normalization. config.ssl_image_size = 224 # Size of the input images for the SSL model. config.ssl_image_channels = 3 # Number of channels of the input images for the SSL model. + config.num_images = 1 # Number of images to generate. + config.timestep_respacing = "ddim10" # Type of timestep respacing (e.g., ddim25). return config \ No newline at end of file diff --git a/rcdm/guided_diffusion_rcdm/get_rcdm_models.py b/rcdm/guided_diffusion_rcdm/get_rcdm_models.py index 326ba6c..83ddcfb 100644 --- a/rcdm/guided_diffusion_rcdm/get_rcdm_models.py +++ b/rcdm/guided_diffusion_rcdm/get_rcdm_models.py @@ -4,41 +4,41 @@ import torch.nn as nn from torchvision import models as torchvision_models -def get_dict_rcdm_model(model="dino", use_head=False): +def get_dict_rcdm_model(model="dino", use_head=False, model_dir='./'): ''' Download checkpoints of RCDM. 
''' if model == "supervised": - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_supervised.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_supervised.pt", map_location="cpu", model_dir=model_dir) return trained_model elif model == "simclr": if use_head: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_simclr_head.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_simclr_head.pt", map_location="cpu", model_dir=model_dir) else: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_simclr_trunk.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_simclr_trunk.pt", map_location="cpu", model_dir=model_dir) return trained_model elif model == "barlow": if use_head: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_barlow_head.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_barlow_head.pt", map_location="cpu", model_dir=model_dir) else: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_barlow_trunk.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_barlow_trunk.pt", map_location="cpu", model_dir=model_dir) return trained_model elif model == "vicreg": if use_head: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_vicreg_head.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_vicreg_head.pt", map_location="cpu", model_dir=model_dir) else: - trained_model = 
torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_vicreg_trunk.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_vicreg_trunk.pt", map_location="cpu", model_dir=model_dir) return trained_model elif model == "dino": if use_head: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_dino_head.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_dino_head.pt", map_location="cpu", model_dir=model_dir) else: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_dino_trunk.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_dino_trunk.pt", map_location="cpu", model_dir=model_dir) return trained_model else: diff --git a/rcdm/guided_diffusion_rcdm/get_ssl_models.py b/rcdm/guided_diffusion_rcdm/get_ssl_models.py index db5bd60..b8e3978 100644 --- a/rcdm/guided_diffusion_rcdm/get_ssl_models.py +++ b/rcdm/guided_diffusion_rcdm/get_ssl_models.py @@ -89,7 +89,7 @@ def forward(self, x): x = torch.nn.functional.normalize(x, dim=-1, p=2).detach() return x -def get_model(model="dino", use_head=False): +def get_model(model="dino", use_head=False, model_dir='./'): ''' Select a model that will be used to compute the embeddings needed by RCDM. You can use any kind of model, ConvNets/MLPs, or VITs. 
@@ -109,7 +109,7 @@ def get_model(model="dino", use_head=False): nlayers=2, use_bn=True, ) - pretrained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/dino/dino_resnet50_pretrain/dino_resnet50_pretrain_full_checkpoint.pth", map_location="cpu") + pretrained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/dino/dino_resnet50_pretrain/dino_resnet50_pretrain_full_checkpoint.pth", map_location="cpu", model_dir=model_dir) pretrained_model = pretrained_model["teacher"] if "state_dict" in pretrained_model: pretrained_model = pretrained_model["state_dict"] @@ -130,7 +130,7 @@ def get_model(model="dino", use_head=False): elif model == "simclr": embedding_model = torchvision_models.resnet50() embedding_model.fc = nn.Identity() - pretrained_model_base = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/vissl/model_zoo/simclr_rn50_1000ep_simclr_8node_resnet_16_07_20.afe428c7/model_final_checkpoint_phase999.torch", map_location="cpu") + pretrained_model_base = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/vissl/model_zoo/simclr_rn50_1000ep_simclr_8node_resnet_16_07_20.afe428c7/model_final_checkpoint_phase999.torch", map_location="cpu", model_dir=model_dir) # Load trunk pretrained_model = pretrained_model_base["classy_state_dict"]["base_model"]["model"]["trunk"] pretrained_model = {k.replace("_feature_blocks.", ""): v for k, v in pretrained_model.items()} @@ -149,7 +149,7 @@ def get_model(model="dino", use_head=False): elif model == "barlow": embedding_model = torchvision_models.resnet50() embedding_model.fc = nn.Identity() - pretrained_model_base = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/vissl/model_zoo/barlow_twins/barlow_twins_32gpus_4node_imagenet1k_1000ep_resnet50.torch", map_location="cpu") + pretrained_model_base = 
torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/vissl/model_zoo/barlow_twins/barlow_twins_32gpus_4node_imagenet1k_1000ep_resnet50.torch", map_location="cpu", model_dir=model_dir) # Load trunk pretrained_model = pretrained_model_base["classy_state_dict"]["base_model"]["model"]["trunk"] pretrained_model = {k.replace("_feature_blocks.", ""): v for k, v in pretrained_model.items()} @@ -169,7 +169,7 @@ def get_model(model="dino", use_head=False): elif model == "vicreg": embedding_model = torchvision_models.resnet50() embedding_model.fc = nn.Identity() - pretrained_model_base = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/vicreg/resnet50_fullckpt.pth", map_location="cpu") + pretrained_model_base = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/vicreg/resnet50_fullckpt.pth", map_location="cpu", model_dir=model_dir) embedding_model.classifier = nn.Identity() embedding_model.projector = Projector(emb=8192) pretrained = "resnet50_fullckpt.pth" diff --git a/run_simCLR.py b/run_simCLR.py index 564416b..4ce3f74 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -140,9 +140,11 @@ def main(): args = parser.parse_args() print(args) + torch.multiprocessing.set_start_method("spawn") + assert ( args.n_views == 2 - ), "Only two view training is supported. Please use --n-views 2." + ), "Only two view training is supported. Please use --n-views 2." 
if args.distributed_mode: dist_utils.init_distributed_mode( @@ -168,6 +170,7 @@ def main(): seed=args.seed, drop_last=True, ) + init_fn = partial( worker_init_fn, num_workers=args.num_workers, @@ -181,7 +184,7 @@ def main(): sampler=train_sampler, num_workers=args.num_workers, worker_init_fn=init_fn, - pin_memory=True, + pin_memory=False, drop_last=True, ) From 23cdf804c07178845fa7183bb6a3c6cac322c985 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 25 Jan 2024 17:31:28 -0800 Subject: [PATCH 03/57] Add checkpointing --- SimCLR/simclr.py | 32 ++++++++++++++++++-------------- run_simCLR.py | 2 ++ 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index 661fdf2..8a06401 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -18,7 +18,10 @@ def __init__(self, *args, **kwargs): self.optimizer = kwargs["optimizer"] self.scheduler = kwargs["scheduler"] self.device_id = kwargs["device_id"] - self.writer = SummaryWriter() + log_dir = os.path.join(self.args.model_dir, self.args.experiment_name) + if not os.isdir(log_dir): + os.mkdir(log_dir) + self.writer = SummaryWriter(log_dir=log_dir) self.criterion = torch.nn.CrossEntropyLoss().cuda(self.device_id) def simclr_logits_and_labels(self, features): @@ -91,7 +94,7 @@ def train(self, train_loader): if n_iter % self.args.log_every_n_steps == 0: print( - f"Calculating accuracy/loss at iteration: {n_iter}, loss: {loss}", + f"Calculating accuracy/loss at iteration: {n_iter}, loss: {loss},acc: {accuracy(logits, labels, topk=(1, 5))}", ) top1, top5 = accuracy(logits, labels, topk=(1, 5)) self.writer.add_scalar("loss", loss, global_step=n_iter) @@ -102,6 +105,18 @@ def train(self, train_loader): self.scheduler.get_last_lr()[0], global_step=n_iter, ) + # save model checkpoints + checkpoint_name = "checkpoint_{:04d}.pth.tar".format(self.args.epochs) + save_checkpoint( + { + "n_iter": n_iter, + "arch": self.args.arch, + "state_dict": self.model.state_dict(), + "optimizer": 
self.optimizer.state_dict(), + }, + is_best=False, + filename=os.path.join(self.writer.log_dir, checkpoint_name), + ) n_iter += 1 @@ -112,16 +127,5 @@ def train(self, train_loader): print(f"Epoch: {epoch_counter}\tLoss: {loss}\tTop1 accuracy: {top1[0]}") print("Training has finished.") - # save model checkpoints - checkpoint_name = "checkpoint_{:04d}.pth.tar".format(self.args.epochs) - save_checkpoint( - { - "epoch": self.args.epochs, - "arch": self.args.arch, - "state_dict": self.model.state_dict(), - "optimizer": self.optimizer.state_dict(), - }, - is_best=False, - filename=os.path.join(self.writer.log_dir, checkpoint_name), - ) + print(f"Model checkpoint and metadata has been saved at {self.writer.log_dir}.") diff --git a/run_simCLR.py b/run_simCLR.py index 4ce3f74..3cb75d6 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -115,6 +115,8 @@ ) parser.add_argument("--distributed_launcher", default="slurm") parser.add_argument("--distributed_backend", default="nccl") +parser.add_argument("--model_dir", default="model_checkpoints") +parser.add_argument("--experiment_name", default="simclr") def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: From ae427371edbdc4a9e3f40aebc2587f4970f57fc5 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 25 Jan 2024 17:36:58 -0800 Subject: [PATCH 04/57] update --- SimCLR/data_aug/rcdm_aug.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index 2d917df..5cb22dd 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -16,7 +16,7 @@ def __init__(self, config, device_id): self.device_id = device_id # Load SSL model - self.ssl_model = get_model(self.config.type_model, self.config.use_head, self.model_dir).cuda(self.device_id).eval() + self.ssl_model = get_model(self.config.type_model, self.config.use_head, self.config.model_dir).cuda(self.device_id).eval() for p in self.ssl_model.parameters(): 
p.requires_grad = False @@ -29,7 +29,7 @@ def __init__(self, config, device_id): ) if self.config.model_path == "": - trained_model = get_dict_rcdm_model(self.config.type_model, self.config.use_head, self.model_dir) + trained_model = get_dict_rcdm_model(self.config.type_model, self.config.use_head, self.config.model_dir) else: trained_model = torch.load(self.config.model_path, map_location="cpu") self.model.load_state_dict(trained_model, strict=True) From 73c580b7f04635edd9b83938617d177ba42c68c6 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 25 Jan 2024 17:41:41 -0800 Subject: [PATCH 05/57] update --- SimCLR/simclr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index 8a06401..518fe8b 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -19,7 +19,7 @@ def __init__(self, *args, **kwargs): self.scheduler = kwargs["scheduler"] self.device_id = kwargs["device_id"] log_dir = os.path.join(self.args.model_dir, self.args.experiment_name) - if not os.isdir(log_dir): + if not os.path.exists(log_dir): os.mkdir(log_dir) self.writer = SummaryWriter(log_dir=log_dir) self.criterion = torch.nn.CrossEntropyLoss().cuda(self.device_id) From 25936dca4dc8db5fe0a8ef8b7ed025e51b74485e Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 25 Jan 2024 17:52:56 -0800 Subject: [PATCH 06/57] update config --- SimCLR/data_aug/contrastive_learning_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index 71552b0..9fbdc87 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -24,7 +24,7 @@ def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True, device_id=N prob = random.uniform(0, 1) if prob < 0.5 and rcdm_agumentation: rcdm_config = get_config() - timestep_respacing = ["ddim10", "ddim25", "ddim50"] + timestep_respacing = ["ddim10", 
"ddim25"] rcdm_config.timestep_respacing = timestep_respacing[random.randrange(len(timestep_respacing))] transform_list = [ transforms.Resize(size=(size,size)), From 2d9d1971382c409bcb6b49556ffc8c32aab69517 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 25 Jan 2024 17:59:41 -0800 Subject: [PATCH 07/57] edit logging --- SimCLR/simclr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index 518fe8b..5822cd8 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -93,8 +93,9 @@ def train(self, train_loader): scaler.update() if n_iter % self.args.log_every_n_steps == 0: + top1, top5 = accuracy(logits, labels, topk=(1, 5)) print( - f"Calculating accuracy/loss at iteration: {n_iter}, loss: {loss},acc: {accuracy(logits, labels, topk=(1, 5))}", + f"Calculating accuracy/loss at iteration: {n_iter}, loss: {loss},acc: top1 - {top1[0]}, top5 - {top5[0]}", ) top1, top5 = accuracy(logits, labels, topk=(1, 5)) self.writer.add_scalar("loss", loss, global_step=n_iter) From 422a10dc29719886f41807795f63b32f907b4613 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 30 Jan 2024 13:54:22 -0500 Subject: [PATCH 08/57] server --- .../data_aug/contrastive_learning_dataset.py | 4 +- SimCLR/data_aug/rcdm_aug.py | 128 +++++++++--------- SimCLR/data_aug/rcdm_config.py | 56 ++++---- 3 files changed, 92 insertions(+), 96 deletions(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index 9fbdc87..dd786ce 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -22,10 +22,8 @@ def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True, device_id=N rcdm_agumentation (bool, optional): Whether to use RCDM augmentation. Defaults to True. 
""" prob = random.uniform(0, 1) - if prob < 0.5 and rcdm_agumentation: + if prob < 1 and rcdm_agumentation: rcdm_config = get_config() - timestep_respacing = ["ddim10", "ddim25"] - rcdm_config.timestep_respacing = timestep_respacing[random.randrange(len(timestep_respacing))] transform_list = [ transforms.Resize(size=(size,size)), transforms.ToTensor(), diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index 5cb22dd..8ea9e23 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -1,66 +1,64 @@ -import torch -from rcdm.guided_diffusion_rcdm.get_ssl_models import get_model -from rcdm.guided_diffusion_rcdm.get_rcdm_models import get_dict_rcdm_model -from rcdm.guided_diffusion_rcdm.script_util import ( - model_and_diffusion_defaults, - create_model_and_diffusion -) - - -class RCDMInference(object): - def __init__(self, config, device_id): - """ - Initialize the RCDMInference class with necessary parameters and load models. - """ - self.config = config - self.device_id = device_id - - # Load SSL model - self.ssl_model = get_model(self.config.type_model, self.config.use_head, self.config.model_dir).cuda(self.device_id).eval() - for p in self.ssl_model.parameters(): - p.requires_grad = False - - # Load RCDM model - model_defaults = model_and_diffusion_defaults() - model_args = {k: getattr(self.config, k, model_defaults[k]) for k in model_defaults} - - self.model, self.diffusion = create_model_and_diffusion( - **model_args, G_shared=self.config.no_shared, feat_cond=True, ssl_dim=self.ssl_model(torch.zeros(1, config.ssl_image_channels, config.ssl_image_size, config.ssl_image_size).cuda()).size(1) - ) - - if self.config.model_path == "": - trained_model = get_dict_rcdm_model(self.config.type_model, self.config.use_head, self.config.model_dir) - else: - trained_model = torch.load(self.config.model_path, map_location="cpu") - self.model.load_state_dict(trained_model, strict=True) - self.model.cuda(self.device_id) - - def 
__call__(self, img): - """ - Run the RCDM model inference on an images. - - Args: - img (torch.Tensor): An image to apply RCDM to. - - Returns: - List[torch.Tensor]: List of generated image tensors. - """ - print("Starting RCDM model inference...") - - sample_fn = self.diffusion.p_sample_loop if not self.config.use_ddim else self.diffusion.ddim_sample_loop - - img = img.unsqueeze(0).repeat(1, 1, 1, 1).cuda(self.device_id) - model_kwargs = {} - - with torch.no_grad(): - feat = self.ssl_model(img).detach() - model_kwargs["feat"] = feat - sample = sample_fn( - self.model, - (1, 3, self.config.image_size, self.config.image_size), - clip_denoised=self.config.clip_denoised, - model_kwargs=model_kwargs, - ) - - print("Sampling completed!") +import torch +from rcdm.guided_diffusion_rcdm.get_ssl_models import get_model +from rcdm.guided_diffusion_rcdm.get_rcdm_models import get_dict_rcdm_model +from rcdm.guided_diffusion_rcdm.script_util import ( + model_and_diffusion_defaults, + create_model_and_diffusion +) + + +class RCDMInference(object): + def __init__(self, config, device_id): + """ + Initialize the RCDMInference class with necessary parameters and load models. 
+ """ + self.config = config + self.device_id = device_id + + # Load SSL model + self.ssl_model = get_model(self.config.type_model, self.config.use_head, self.config.model_dir).cuda(self.device_id).eval() + for p in self.ssl_model.parameters(): + p.requires_grad = False + + # Load RCDM model + model_defaults = model_and_diffusion_defaults() + model_args = {k: getattr(self.config, k, model_defaults[k]) for k in model_defaults} + + self.model, self.diffusion = create_model_and_diffusion( + **model_args, G_shared=self.config.no_shared, feat_cond=True, ssl_dim=self.ssl_model(torch.zeros(1, config.ssl_image_channels, config.ssl_image_size, config.ssl_image_size).cuda()).size(1) + ) + + if self.config.model_path == "": + trained_model = get_dict_rcdm_model(self.config.type_model, self.config.use_head, self.config.model_dir) + else: + trained_model = torch.load(self.config.model_path, map_location="cpu") + self.model.load_state_dict(trained_model, strict=True) + self.model.cuda(self.device_id) + + def __call__(self, img): + """ + Run the RCDM model inference on an images. + + Args: + img (torch.Tensor): An image to apply RCDM to. + + Returns: + List[torch.Tensor]: List of generated image tensors. 
+ """ + + sample_fn = self.diffusion.p_sample_loop if not self.config.use_ddim else self.diffusion.ddim_sample_loop + + img = img.unsqueeze(0).repeat(1, 1, 1, 1).cuda(self.device_id) + model_kwargs = {} + + with torch.no_grad(): + feat = self.ssl_model(img).detach() + model_kwargs["feat"] = feat + sample = sample_fn( + self.model, + (1, 3, self.config.image_size, self.config.image_size), + clip_denoised=self.config.clip_denoised, + model_kwargs=model_kwargs, + ) + return sample.detach().squeeze(0) \ No newline at end of file diff --git a/SimCLR/data_aug/rcdm_config.py b/SimCLR/data_aug/rcdm_config.py index 4af056f..0525a1a 100644 --- a/SimCLR/data_aug/rcdm_config.py +++ b/SimCLR/data_aug/rcdm_config.py @@ -1,29 +1,29 @@ -import ml_collections - -def get_config(): - config = ml_collections.ConfigDict() - config.image_size = 128 # The size of the images to generate. - config.class_cond = False # If true, use class conditional generation. - config.model_dir = "/ssd003/projects/aieng/genssl" # Path to the directory containing the model. - config.type_model = "simclr" # Type of model to use (e.g., simclr, dino). - config.use_head = False # If true, use the projector/head for SSL representation. - config.model_path = "" # Replace with the path to your model if you have one. - config.use_ddim = True # If true, use DDIM sampler. - config.no_shared = True # If false, enables squeeze and excitation. - config.clip_denoised = True # If true, clip denoised images. - config.attention_resolutions = "32,16,8" # Resolutions to use for attention layers. - config.diffusion_steps = 1000 # Number of diffusion steps. - config.learn_sigma = True # If true, learn the noise level. - config.noise_schedule = "linear" # Type of noise schedule (e.g., linear). - config.num_channels = 256 # Number of channels in the model. - config.num_heads = 4 # Number of attention heads. - config.num_res_blocks = 2 # Number of residual blocks. 
- config.resblock_updown = True # If true, use up/down sampling in resblocks. - config.use_fp16 = False # If true, use 16-bit floating point precision. - config.use_scale_shift_norm = True # If true, use scale-shift normalization. - config.ssl_image_size = 224 # Size of the input images for the SSL model. - config.ssl_image_channels = 3 # Number of channels of the input images for the SSL model. - config.num_images = 1 # Number of images to generate. - config.timestep_respacing = "ddim10" # Type of timestep respacing (e.g., ddim25). - +import ml_collections + +def get_config(): + config = ml_collections.ConfigDict() + config.image_size = 128 # The size of the images to generate. + config.class_cond = False # If true, use class conditional generation. + config.model_dir = "/ssd003/projects/aieng/genssl" # Path to the directory containing the model. + config.type_model = "simclr" # Type of model to use (e.g., simclr, dino). + config.use_head = False # If true, use the projector/head for SSL representation. + config.model_path = "" # Replace with the path to your model if you have one. + config.use_ddim = True # If true, use DDIM sampler. + config.no_shared = True # If false, enables squeeze and excitation. + config.clip_denoised = True # If true, clip denoised images. + config.attention_resolutions = "32,16,8" # Resolutions to use for attention layers. + config.diffusion_steps = 1000 # Number of diffusion steps. + config.learn_sigma = True # If true, learn the noise level. + config.noise_schedule = "linear" # Type of noise schedule (e.g., linear). + config.num_channels = 256 # Number of channels in the model. + config.num_heads = 4 # Number of attention heads. + config.num_res_blocks = 2 # Number of residual blocks. + config.resblock_updown = True # If true, use up/down sampling in resblocks. + config.use_fp16 = False # If true, use 16-bit floating point precision. + config.use_scale_shift_norm = True # If true, use scale-shift normalization. 
+ config.ssl_image_size = 224 # Size of the input images for the SSL model. + config.ssl_image_channels = 3 # Number of channels of the input images for the SSL model. + config.num_images = 1 # Number of images to generate. + config.timestep_respacing = "ddim2" # Type of timestep respacing (e.g., ddim25). + return config \ No newline at end of file From 60d3dc3666662b436c01d301e411c266e819e7a9 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 30 Jan 2024 11:02:56 -0800 Subject: [PATCH 09/57] add checkpointing --- SimCLR/data_aug/contrastive_learning_dataset.py | 2 +- SimCLR/simclr.py | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index dd786ce..c45de87 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -22,7 +22,7 @@ def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True, device_id=N rcdm_agumentation (bool, optional): Whether to use RCDM augmentation. Defaults to True. 
""" prob = random.uniform(0, 1) - if prob < 1 and rcdm_agumentation: + if prob < 0.5 and rcdm_agumentation: rcdm_config = get_config() transform_list = [ transforms.Resize(size=(size,size)), diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index 5822cd8..129d146 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -106,8 +106,9 @@ def train(self, train_loader): self.scheduler.get_last_lr()[0], global_step=n_iter, ) - # save model checkpoints - checkpoint_name = "checkpoint_{:04d}.pth.tar".format(self.args.epochs) + + # save latest model checkpoints + checkpoint_name = "checkpoint_final.pth.tar" save_checkpoint( { "n_iter": n_iter, @@ -126,6 +127,18 @@ def train(self, train_loader): self.scheduler.step() print(f"Epoch: {epoch_counter}\tLoss: {loss}\tTop1 accuracy: {top1[0]}") + # save model checkpoints after epochs + checkpoint_name = "checkpoint_epoch_{:04d}.pth.tar".format(epoch_counter) + save_checkpoint( + { + "n_iter": n_iter, + "arch": self.args.arch, + "state_dict": self.model.state_dict(), + "optimizer": self.optimizer.state_dict(), + }, + is_best=False, + filename=os.path.join(self.writer.log_dir, checkpoint_name), + ) print("Training has finished.") From 9ae6ee87e5c309c6fa53b471b908ddc30866f74c Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 16:56:56 -0800 Subject: [PATCH 10/57] Add eval files --- SimCLR/data_aug/supervised_dataset.py | 56 ++++++ SimCLR/evaluate_simCLR.py | 242 ++++++++++++++++++++++++++ SimCLR/models/resnet_pretrained.py | 63 +++++++ 3 files changed, 361 insertions(+) create mode 100644 SimCLR/data_aug/supervised_dataset.py create mode 100644 SimCLR/evaluate_simCLR.py create mode 100644 SimCLR/models/resnet_pretrained.py diff --git a/SimCLR/data_aug/supervised_dataset.py b/SimCLR/data_aug/supervised_dataset.py new file mode 100644 index 0000000..6b1713a --- /dev/null +++ b/SimCLR/data_aug/supervised_dataset.py @@ -0,0 +1,56 @@ +from torchvision import datasets, transforms +from torchvision.transforms import transforms + 
+from SimCLR.exceptions.exceptions import InvalidDatasetSelection +import random + +class SupervisedDataset: + def __init__(self, root_folder): + self.root_folder = root_folder + + @staticmethod + def get_transform(size): + """Return a set of simple transformations for supervised learning. + + Args: + size (int): Image size. + """ + transform_list = [ + transforms.Resize(size=(size,size)), + transforms.ToTensor(), + ] + + return transforms.Compose(transform_list) + + + def get_dataset(self, name, train = True): + if name == "imagenet": + if train: + split = "train" + else: + split = "val" + return datasets.ImageNet( + self.root_folder, + split=split, + transform=self.get_transform(224), + ) + elif name == "cifar10": + return datasets.CIFAR10( + self.root_folder, + train=train, + transform= self.get_transform(32), + download=True, + ) + elif name == "stl10": + if train: + split = "train" + else: + split = "test" + return datasets.STL10( + self.root_folder, + split=split, + transform=self.get_transform(96), + download=True, + ) + else: + raise InvalidDatasetSelection() \ No newline at end of file diff --git a/SimCLR/evaluate_simCLR.py b/SimCLR/evaluate_simCLR.py new file mode 100644 index 0000000..5910c28 --- /dev/null +++ b/SimCLR/evaluate_simCLR.py @@ -0,0 +1,242 @@ +import argparse +import random +from functools import partial + +import torch +from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 +from torch.utils.data.distributed import DistributedSampler +from torchvision import models + +from SimCLR import distributed as dist_utils +from SimCLR.data_aug.supervised_dataset import SupervisedDataset +from SimCLR.models.resnet_pretrained import PretrainedResNet +from SimCLR.simclr import SimCLR + + +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + +parser = argparse.ArgumentParser(description="PyTorch SimCLR") +parser.add_argument( + "-data", + 
metavar="DIR", + default="/scratch/ssd004/datasets/imagenet256", + help="path to dataset, for imagenet: /scratch/ssd004/datasets/imagenet256 ", +) +parser.add_argument( + "-dataset-name", + default="imagenet", + help="dataset-name", + choices=["stl10", "cifar10", "imagenet"], +) +parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet18", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet18)", +) +parser.add_argument( + "-j", + "--num_workers", + default=4, + type=int, + metavar="N", + help="number of data loading workers", +) +parser.add_argument( + "--epochs", default=100, type=int, metavar="N", help="number of total epochs to run" +) +parser.add_argument( + "-b", + "--batch-size", + default=64, + type=int, + metavar="N", + help="mini-batch size (default: 256), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) + +parser.add_argument( + "--seed", default=42, type=int, help="seed for initializing training. " +) + +parser.add_argument( + "--log-every-n-steps", default=100, type=int, help="Log every n steps" +) +parser.add_argument( + "--distributed_mode", action="store_true", help="Enable distributed training" +) +parser.add_argument("--distributed_launcher", default="slurm") +parser.add_argument("--distributed_backend", default="nccl") +parser.add_argument("--model_dir", default="model_checkpoints") +parser.add_argument("--experiment_name", default="simclr") +parser.add_argument("--linear_evaluation", + action="store_true", + help="Whether or not to evaluate the linear evaluation of the model.") + +def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: + """Initialize worker processes with a random seed. + + Parameters + ---------- + worker_id : int + ID of the worker process. + num_workers : int + Total number of workers that will be initialized. 
+    rank : int
+        The rank of the current process.
+    seed : int
+        A random seed used to determine the worker seed.
+    """
+    worker_seed = num_workers * rank + worker_id + seed
+    torch.manual_seed(worker_seed)
+    random.seed(worker_seed)
+
+def accuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+        res = []
+        for k in topk:
+            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(100.0 / batch_size))
+        return res
+
+
+def main():
+    args = parser.parse_args()
+    print(args)
+
+    torch.multiprocessing.set_start_method("spawn")
+
+    assert (
+        args.n_views == 2
+    ), "Only two view training is supported. Please use --n-views 2."
+
+    if args.distributed_mode:
+        dist_utils.init_distributed_mode(
+            launcher=args.distributed_launcher,
+            backend=args.distributed_backend,
+        )
+        device_id = torch.cuda.current_device()
+    else:
+        device_id = None
+
+    dataset = SupervisedDataset(args.data)
+    train_dataset = dataset.get_dataset(
+        name = args.dataset_name,
+        train=True,
+    )
+    test_dataset = dataset.get_dataset(
+        name = args.dataset_name,
+        train=False,
+    )
+    train_sampler = None
+    test_sampler = None
+
+    if dist_utils.is_dist_avail_and_initialized() and args.distributed_mode:
+        train_sampler = DistributedSampler(
+            train_dataset,
+            seed=args.seed,
+            drop_last=True,
+        )
+        test_sampler = DistributedSampler(
+            test_dataset,
+            seed=args.seed,
+            drop_last=False,
+        )
+
+    init_fn = partial(
+        worker_init_fn,
+        num_workers=args.num_workers,
+        rank=dist_utils.get_rank(),
+        seed=args.seed,
+    )
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.batch_size,
+        shuffle=(train_sampler is None),
+        sampler=train_sampler,
+        num_workers=args.num_workers,
+        worker_init_fn=init_fn,
+ pin_memory=False, + drop_last=True, + ) + test_loader = torch.utils.data.DataLoader( + test_dataset, + batch_size=args.batch_size, + shuffle=(test_sampler is None), + sampler=test_sampler, + num_workers=args.num_workers, + worker_init_fn=init_fn, + pin_memory=False, + drop_last=False, + ) + + model = PretrainedResNet(base_model=args.arch, pretrained_dir = args.model_dir, linear_eval=args.linear_evaluation) + + if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): + # set the single device scope, otherwise DistributedDataParallel will + # use all available devices + torch.cuda.set_device(device_id) + model = model.cuda(device_id) + model = DDP(model, device_ids=[device_id]) + else: + model = model.cuda() + + optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.0008) + criterion = torch.nn.CrossEntropyLoss().cuda(device_id) + + for epoch in range(args.epochs): + if dist_utils.is_dist_avail_and_initialized(): + train_loader.sampler.set_epoch(epoch) + top1_train_accuracy = 0 + for counter, (x_batch, y_batch) in enumerate(train_loader): + x_batch = x_batch.cuda(device_id) + y_batch = y_batch.cuda(device_id) + + logits = model(x_batch) + loss = criterion(logits, y_batch) + top1 = accuracy(logits, y_batch, topk=(1,)) + top1_train_accuracy += top1[0] + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + top1_train_accuracy /= counter + 1 + top1_accuracy = 0 + top5_accuracy = 0 + for counter, (x_batch, y_batch) in enumerate(test_loader): + x_batch = x_batch.cuda(device_id) + y_batch = y_batch.cuda(device_id) + + logits = model(x_batch) + + top1, top5 = accuracy(logits, y_batch, topk=(1, 5)) + top1_accuracy += top1[0] + top5_accuracy += top5[0] + + top1_accuracy /= counter + 1 + top5_accuracy /= counter + 1 + print( + f"Epoch {epoch}\tTop1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", + ) + + + + +if __name__ == "__main__": + main() \ No 
newline at end of file diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py new file mode 100644 index 0000000..97f1f80 --- /dev/null +++ b/SimCLR/models/resnet_pretrained.py @@ -0,0 +1,63 @@ +import torch +from torch import nn +from torchvision import models + +from ..exceptions.exceptions import InvalidBackboneError + + +class PretrainedResNet(nn.Module): + def __init__(self, base_model, pretrained_dir, linear_eval=True): + super(PretrainedResNet, self).__init__() + + self.pretrained_dir = pretrained_dir + + self.resnet_dict = { + "resnet18": models.resnet18(pretrained=False, num_classes=10), + "resnet50": models.resnet50(pretrained=False, num_classes=10), + } + + self.backbone = self._get_basemodel(base_model) + + # load pretrained weights + log = self._load_pretrained() + assert log.missing_keys == ["fc.weight", "fc.bias"] + + if linear_eval: + # freeze all layers but the last fc + self._freeze_backbone() + parameters = list(filter(lambda p: p.requires_grad, self.backbone.parameters())) + assert len(parameters) == 2 # fc.weight, fc.bias + + def _load_pretrained(self): + checkpoint = torch.load(self.pretrained_dir, map_location='cpu') + state_dict = checkpoint["state_dict"] + for k in list(state_dict.keys()): + if k.startswith("backbone."): + if k.startswith("backbone") and not k.startswith("backbone.fc"): + # remove prefix + state_dict[k[len("backbone.") :]] = state_dict[k] + del state_dict[k] + log = self.backbone.load_state_dict(state_dict, strict=False) + return log + + + def _freeze_backbone(self): + # freeze all layers but the last fc + for name, param in self.backbone.named_parameters(): + if name not in ["fc.weight", "fc.bias"]: + param.requires_grad = False + return + + + def _get_basemodel(self, model_name): + try: + model = self.resnet_dict[model_name] + except KeyError: + raise InvalidBackboneError( + "Invalid backbone architecture. 
Check the config file and pass one of: resnet18 or resnet50", + ) + else: + return model + + def forward(self, x): + return self.backbone(x) From 642ad8a4aee9c77666cfd13344a4ebe05f55210e Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:04:26 -0800 Subject: [PATCH 11/57] add slrm file --- eval_simclr.slrm | 32 +++++++++++++++++++ .../evaluate_simCLR.py => evaluate_simCLR.py | 8 ++--- 2 files changed, 34 insertions(+), 6 deletions(-) create mode 100644 eval_simclr.slrm rename SimCLR/evaluate_simCLR.py => evaluate_simCLR.py (96%) diff --git a/eval_simclr.slrm b/eval_simclr.slrm new file mode 100644 index 0000000..39387be --- /dev/null +++ b/eval_simclr.slrm @@ -0,0 +1,32 @@ +#!/bin/bash + +#SBATCH --job-name=train_sunrgbd +#SBATCH --partition=t4v2 +#SBATCH --time=12:00:00 +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=2G +#SBATCH --output=slurm-%N-%j.out +#SBATCH --qos=m + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl/bin/activate + +export NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export PYTHONPATH="." +nvidia-smi + +# “srun” executes the script times +srun python evaluate_simCLR.py \ +--distributed_mode \ +--batch-size=256 \ +--pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ No newline at end of file diff --git a/SimCLR/evaluate_simCLR.py b/evaluate_simCLR.py similarity index 96% rename from SimCLR/evaluate_simCLR.py rename to evaluate_simCLR.py index 5910c28..ac265c5 100644 --- a/SimCLR/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -66,16 +66,12 @@ "--seed", default=42, type=int, help="seed for initializing training. 
" ) -parser.add_argument( - "--log-every-n-steps", default=100, type=int, help="Log every n steps" -) parser.add_argument( "--distributed_mode", action="store_true", help="Enable distributed training" ) parser.add_argument("--distributed_launcher", default="slurm") parser.add_argument("--distributed_backend", default="nccl") -parser.add_argument("--model_dir", default="model_checkpoints") -parser.add_argument("--experiment_name", default="simclr") +parser.add_argument("--pretrained_model_file", default=None, help="Path to the pretrained model file.") parser.add_argument("--linear_evaluation", action="store_true", help="Whether or not to evaluate the linear evaluation of the model.") @@ -185,7 +181,7 @@ def main(): drop_last=False, ) - model = PretrainedResNet(base_model=args.arch, pretrained_dir = args.model_dir, linear_eval=args.linear_evaluation) + model = PretrainedResNet(base_model=args.arch, pretrained_dir = args.pretrained_model_file, linear_eval=args.linear_evaluation) if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): # set the single device scope, otherwise DistributedDataParallel will From ea7cc42fb5a7cbe309f7fd26b6b205a76898817b Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:06:29 -0800 Subject: [PATCH 12/57] update eval file --- evaluate_simCLR.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index ac265c5..552ad25 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -117,10 +117,6 @@ def main(): torch.multiprocessing.set_start_method("spawn") - assert ( - args.n_views == 2 - ), "Only two view training is supported. Please use --n-views 2." 
- if args.distributed_mode: dist_utils.init_distributed_mode( launcher=args.distributed_launcher, From a9cbba19323fb4349996b79ef48bfdb2aa30a2a6 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:19:33 -0800 Subject: [PATCH 13/57] update eval --- eval_simclr.slrm | 4 +++- evaluate_simCLR.py | 44 +++++++++++++++++++++++++------------------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 39387be..299367c 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -29,4 +29,6 @@ nvidia-smi srun python evaluate_simCLR.py \ --distributed_mode \ --batch-size=256 \ ---pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ No newline at end of file +--pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ +--linear_evaluation \ +--log_every_n_steps=10 \ No newline at end of file diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 552ad25..a552849 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -75,6 +75,9 @@ parser.add_argument("--linear_evaluation", action="store_true", help="Whether or not to evaluate the linear evaluation of the model.") +parser.add_argument( + "--log-every-n-steps", default=100, type=int, help="Log every n steps" +) def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: """Initialize worker processes with a random seed. 
@@ -191,6 +194,8 @@ def main(): optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.0008) criterion = torch.nn.CrossEntropyLoss().cuda(device_id) + n_iter = 0 + for epoch in range(args.epochs): if dist_utils.is_dist_avail_and_initialized(): train_loader.sampler.set_epoch(epoch) @@ -207,25 +212,26 @@ def main(): optimizer.zero_grad() loss.backward() optimizer.step() - - top1_train_accuracy /= counter + 1 - top1_accuracy = 0 - top5_accuracy = 0 - for counter, (x_batch, y_batch) in enumerate(test_loader): - x_batch = x_batch.cuda(device_id) - y_batch = y_batch.cuda(device_id) - - logits = model(x_batch) - - top1, top5 = accuracy(logits, y_batch, topk=(1, 5)) - top1_accuracy += top1[0] - top5_accuracy += top5[0] - - top1_accuracy /= counter + 1 - top5_accuracy /= counter + 1 - print( - f"Epoch {epoch}\tTop1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", - ) + if n_iter % args.log_every_n_steps == 0: + top1_train_accuracy /= counter + 1 + top1_accuracy = 0 + top5_accuracy = 0 + for counter, (x_batch, y_batch) in enumerate(test_loader): + x_batch = x_batch.cuda(device_id) + y_batch = y_batch.cuda(device_id) + + logits = model(x_batch) + + top1, top5 = accuracy(logits, y_batch, topk=(1, 5)) + top1_accuracy += top1[0] + top5_accuracy += top5[0] + + top1_accuracy /= counter + 1 + top5_accuracy /= counter + 1 + print( + f"Epoch {epoch}\t Iter {n_iter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", + ) + n_iter += 1 From 90ea717217e19b3a3518a1936f2ba23ff4431332 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:22:11 -0800 Subject: [PATCH 14/57] check eval --- SimCLR/models/resnet_pretrained.py | 1 + eval_simclr.slrm | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 
97f1f80..82078a2 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -20,6 +20,7 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): # load pretrained weights log = self._load_pretrained() + print(log) assert log.missing_keys == ["fc.weight", "fc.bias"] if linear_eval: diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 299367c..6d52e84 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -30,5 +30,5 @@ srun python evaluate_simCLR.py \ --distributed_mode \ --batch-size=256 \ --pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ ---linear_evaluation \ ---log_every_n_steps=10 \ No newline at end of file +--log-every-n-steps=10 \ +--linear_evaluation \ No newline at end of file From 1cba4c8612cb06bc82f74184147ff2bb17a7bfb5 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:23:40 -0800 Subject: [PATCH 15/57] check --- SimCLR/models/resnet_pretrained.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 82078a2..e319f9d 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -17,6 +17,7 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): } self.backbone = self._get_basemodel(base_model) + print(self.backbone.state_dict().keys()) # load pretrained weights log = self._load_pretrained() @@ -32,6 +33,7 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): def _load_pretrained(self): checkpoint = torch.load(self.pretrained_dir, map_location='cpu') state_dict = checkpoint["state_dict"] + print(state_dict.keys()) for k in list(state_dict.keys()): if k.startswith("backbone."): if k.startswith("backbone") and not k.startswith("backbone.fc"): From 6a34a85a9b5f068ee20bb682bbbb624a333c5962 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:26:14 -0800 Subject: [PATCH 16/57] check state_dicts 
--- SimCLR/models/resnet_pretrained.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index e319f9d..6e5df35 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -16,8 +16,8 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): "resnet50": models.resnet50(pretrained=False, num_classes=10), } - self.backbone = self._get_basemodel(base_model) - print(self.backbone.state_dict().keys()) + model = self._get_basemodel(base_model) + print(model.state_dict().keys()) # load pretrained weights log = self._load_pretrained() @@ -29,6 +29,7 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): self._freeze_backbone() parameters = list(filter(lambda p: p.requires_grad, self.backbone.parameters())) assert len(parameters) == 2 # fc.weight, fc.bias + self.backbone = model def _load_pretrained(self): checkpoint = torch.load(self.pretrained_dir, map_location='cpu') From 86dcdf7c734eb22500422d222c2e47ec1e0e14b6 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:29:52 -0800 Subject: [PATCH 17/57] check --- SimCLR/models/resnet_pretrained.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 6e5df35..9cd18dc 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -16,8 +16,8 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): "resnet50": models.resnet50(pretrained=False, num_classes=10), } - model = self._get_basemodel(base_model) - print(model.state_dict().keys()) + self.backbone = self._get_basemodel(base_model) + print(self.backbone.state_dict().keys()) # load pretrained weights log = self._load_pretrained() @@ -29,17 +29,16 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): self._freeze_backbone() parameters = list(filter(lambda p: 
p.requires_grad, self.backbone.parameters())) assert len(parameters) == 2 # fc.weight, fc.bias - self.backbone = model def _load_pretrained(self): checkpoint = torch.load(self.pretrained_dir, map_location='cpu') state_dict = checkpoint["state_dict"] print(state_dict.keys()) for k in list(state_dict.keys()): - if k.startswith("backbone."): - if k.startswith("backbone") and not k.startswith("backbone.fc"): + if k.startswith("module.backbone."): + if k.startswith("module.backbone") and not k.startswith("module.backbone.fc"): # remove prefix - state_dict[k[len("backbone.") :]] = state_dict[k] + state_dict[k[len("module.backbone.") :]] = state_dict[k] del state_dict[k] log = self.backbone.load_state_dict(state_dict, strict=False) return log From 4e683a4e75ed60d726630c0e6445a37b9803804e Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:32:34 -0800 Subject: [PATCH 18/57] edit eval classes --- SimCLR/models/resnet_pretrained.py | 10 ++++------ evaluate_simCLR.py | 11 +++++++++-- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 9cd18dc..4c3ea45 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -6,22 +6,21 @@ class PretrainedResNet(nn.Module): - def __init__(self, base_model, pretrained_dir, linear_eval=True): + def __init__(self, base_model, pretrained_dir, linear_eval=True, num_classes=10): super(PretrainedResNet, self).__init__() self.pretrained_dir = pretrained_dir self.resnet_dict = { - "resnet18": models.resnet18(pretrained=False, num_classes=10), - "resnet50": models.resnet50(pretrained=False, num_classes=10), + "resnet18": models.resnet18(pretrained=False, num_classes=num_classes), + "resnet50": models.resnet50(pretrained=False, num_classes=num_classes), } self.backbone = self._get_basemodel(base_model) - print(self.backbone.state_dict().keys()) # load pretrained weights log = self._load_pretrained() - print(log) + 
assert log.missing_keys == ["fc.weight", "fc.bias"] if linear_eval: @@ -33,7 +32,6 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): def _load_pretrained(self): checkpoint = torch.load(self.pretrained_dir, map_location='cpu') state_dict = checkpoint["state_dict"] - print(state_dict.keys()) for k in list(state_dict.keys()): if k.startswith("module.backbone."): if k.startswith("module.backbone") and not k.startswith("module.backbone.fc"): diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index a552849..76d724c 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -79,6 +79,7 @@ "--log-every-n-steps", default=100, type=int, help="Log every n steps" ) + def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: """Initialize worker processes with a random seed. @@ -179,8 +180,14 @@ def main(): pin_memory=False, drop_last=False, ) - - model = PretrainedResNet(base_model=args.arch, pretrained_dir = args.pretrained_model_file, linear_eval=args.linear_evaluation) + if args.dataset_name == "cifar10": + num_classes = 10 + elif args.dataset_name == "stl10": + num_classes = 10 + elif args.dataset_name == "imagenet": + num_classes = 1000 + + model = PretrainedResNet(base_model=args.arch, pretrained_dir = args.pretrained_model_file, linear_eval=args.linear_evaluation, num_classes=num_classes) if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): # set the single device scope, otherwise DistributedDataParallel will From ecfab47e3bdec13f6d074c09e9af8d4e26e79bac Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:38:52 -0800 Subject: [PATCH 19/57] check --- evaluate_simCLR.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 76d724c..343bfef 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -212,6 +212,9 @@ def main(): y_batch = y_batch.cuda(device_id) logits = model(x_batch) + print(y_batch) + print(logits.shape) + print(logits) loss = 
criterion(logits, y_batch) top1 = accuracy(logits, y_batch, topk=(1,)) top1_train_accuracy += top1[0] From 20795afce425e2c80a2f644b8772214c256d8b86 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:46:16 -0800 Subject: [PATCH 20/57] update slrm --- eval_simclr.slrm | 3 ++- evaluate_simCLR.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 6d52e84..cbeac19 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -8,7 +8,8 @@ #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu=2G -#SBATCH --output=slurm-%N-%j.out +#SBATCH --output=./runs/eval_slurm-%N-%j.out +#SBATCH --error=./runs/eval_slurm-%N-%j.err #SBATCH --qos=m PY_ARGS=${@:1} diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 343bfef..76d724c 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -212,9 +212,6 @@ def main(): y_batch = y_batch.cuda(device_id) logits = model(x_batch) - print(y_batch) - print(logits.shape) - print(logits) loss = criterion(logits, y_batch) top1 = accuracy(logits, y_batch, topk=(1,)) top1_train_accuracy += top1[0] From 38a86e6cd508f24f7f92453441b1ff43b924b376 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:55:50 -0800 Subject: [PATCH 21/57] Update eval --- evaluate_simCLR.py | 61 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 76d724c..4702802 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -2,10 +2,12 @@ import random from functools import partial +import os import torch from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 from torch.utils.data.distributed import DistributedSampler from torchvision import models +import shutil from SimCLR import distributed as dist_utils from SimCLR.data_aug.supervised_dataset import SupervisedDataset @@ -78,6 +80,8 @@ parser.add_argument( "--log-every-n-steps", default=100, 
type=int, help="Log every n steps" ) +parser.add_argument("--model_dir", default="model_checkpoints") +parser.add_argument("--experiment_name", default="simclr") def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: @@ -113,6 +117,11 @@ def accuracy(output, target, topk=(1,)): correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) res.append(correct_k.mul_(100.0 / batch_size)) return res + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, "model_best.pth.tar") def main(): @@ -203,6 +212,10 @@ def main(): n_iter = 0 + log_dir = os.path.join(args.model_dir, args.experiment_name) + if not os.path.exists(log_dir): + os.mkdir(log_dir) + for epoch in range(args.epochs): if dist_utils.is_dist_avail_and_initialized(): train_loader.sampler.set_epoch(epoch) @@ -220,26 +233,42 @@ def main(): loss.backward() optimizer.step() if n_iter % args.log_every_n_steps == 0: - top1_train_accuracy /= counter + 1 - top1_accuracy = 0 - top5_accuracy = 0 - for counter, (x_batch, y_batch) in enumerate(test_loader): - x_batch = x_batch.cuda(device_id) - y_batch = y_batch.cuda(device_id) - - logits = model(x_batch) - - top1, top5 = accuracy(logits, y_batch, topk=(1, 5)) - top1_accuracy += top1[0] - top5_accuracy += top5[0] - - top1_accuracy /= counter + 1 - top5_accuracy /= counter + 1 + temp_train_acc = top1_train_accuracy / (counter + 1) print( - f"Epoch {epoch}\t Iter {n_iter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", + f"Iter {n_iter}\t Top1 Train accuracy {temp_train_acc.item()}", ) n_iter += 1 + top1_train_accuracy /= counter + 1 + top1_accuracy = 0 + top5_accuracy = 0 + for counter, (x_batch, y_batch) in enumerate(test_loader): + x_batch = x_batch.cuda(device_id) + y_batch = y_batch.cuda(device_id) + + logits = model(x_batch) + + top1, top5 = accuracy(logits, 
y_batch, topk=(1, 5)) + top1_accuracy += top1[0] + top5_accuracy += top5[0] + + top1_accuracy /= counter + 1 + top5_accuracy /= counter + 1 + print( + f"Epoch {n_iter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", + ) + checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch) + save_checkpoint( + { + "n_iter": n_iter, + "arch": args.arch, + "state_dict": model.state_dict(), + "optimizer": optimizer.state_dict(), + }, + is_best=False, + filename=os.path.join(log_dir, checkpoint_name), + ) + From d85de1708a7c651c4dd0dcbb474b2fad6ab0cdd1 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 18:00:16 -0800 Subject: [PATCH 22/57] edit --- eval_simclr.slrm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index cbeac19..f7208b8 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -32,4 +32,6 @@ srun python evaluate_simCLR.py \ --batch-size=256 \ --pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ --log-every-n-steps=10 \ ---linear_evaluation \ No newline at end of file +--linear_evaluation \ +--model_dir="/ssd003/projects/aieng/genssl/experiments" \ +--experiment_name="simclr" \ No newline at end of file From e359965c0f91b5b649261bbeaf5aad0e9911587e Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 18:02:53 -0800 Subject: [PATCH 23/57] edit slrm --- eval_simclr.slrm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index f7208b8..e99853e 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -8,8 +8,8 @@ #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu=2G -#SBATCH --output=./runs/eval_slurm-%N-%j.out -#SBATCH --error=./runs/eval_slurm-%N-%j.err +#SBATCH --output=/ssd003/projects/aieng/genssl/experiments/simclr/eval_slurm-%N-%j.out +#SBATCH 
--error=/ssd003/projects/aieng/genssl/experiments/simclr/eval_slurm-%N-%j.err #SBATCH --qos=m PY_ARGS=${@:1} @@ -31,7 +31,7 @@ srun python evaluate_simCLR.py \ --distributed_mode \ --batch-size=256 \ --pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ ---log-every-n-steps=10 \ +--log-every-n-steps=100 \ --linear_evaluation \ --model_dir="/ssd003/projects/aieng/genssl/experiments" \ --experiment_name="simclr" \ No newline at end of file From c78abd06c780e907243dda80a7bc5164ed8521bc Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 18:03:50 -0800 Subject: [PATCH 24/57] correct sample slrm --- eval_simclr.slrm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index e99853e..5c1010a 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -8,8 +8,8 @@ #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu=2G -#SBATCH --output=/ssd003/projects/aieng/genssl/experiments/simclr/eval_slurm-%N-%j.out -#SBATCH --error=/ssd003/projects/aieng/genssl/experiments/simclr/eval_slurm-%N-%j.err +#SBATCH --output=./runs/eval_slurm-%N-%j.out +#SBATCH --error=./runs/eval_slurm-%N-%j.err #SBATCH --qos=m PY_ARGS=${@:1} From a1fa4ea1abba151f53b0d69bff55a57a3f72625c Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 1 Feb 2024 11:13:19 -0800 Subject: [PATCH 25/57] fix multi gpu --- SimCLR/data_aug/rcdm_aug.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index 8ea9e23..43d6dda 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -16,7 +16,9 @@ def __init__(self, config, device_id): self.device_id = device_id # Load SSL model - self.ssl_model = get_model(self.config.type_model, self.config.use_head, self.config.model_dir).cuda(self.device_id).eval() + self.ssl_model = get_model(self.config.type_model, self.config.use_head, 
self.config.model_dir) + self.ssl_model = self.ssl_model.cuda(self.device_id) + self.ssl_model.eval() for p in self.ssl_model.parameters(): p.requires_grad = False @@ -33,7 +35,7 @@ def __init__(self, config, device_id): else: trained_model = torch.load(self.config.model_path, map_location="cpu") self.model.load_state_dict(trained_model, strict=True) - self.model.cuda(self.device_id) + self.model = self.model.cuda(self.device_id) def __call__(self, img): """ From f96c12175866ade196cf4256a6adc2978afb81ca Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 12 Feb 2024 12:25:26 -0800 Subject: [PATCH 26/57] Delete pytest --- .pre-commit-config.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7d459c0..45f0c42 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,15 +40,6 @@ repos: - id: nbqa-ruff args: [--fix] - - repo: local - hooks: - - id: pytest - name: pytest - entry: python3 -m pytest - language: system - pass_filenames: false - always_run: true - exclude: | (?x)( ^rcdm/| From 73eb9e44567ffe4bf3375296f68fd83b54e0f2d9 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 09:20:05 -0800 Subject: [PATCH 27/57] update eval --- .../data_aug/contrastive_learning_dataset.py | 1 - evaluate_simCLR.py | 51 ++++++++++++++----- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index 8c67ff0..ef488b1 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -7,7 +7,6 @@ from SimCLR.data_aug.rcdm_config import get_config from SimCLR.data_aug.view_generator import ContrastiveLearningViewGenerator from SimCLR.exceptions.exceptions import InvalidDatasetSelection -import random class ContrastiveLearningDataset: def __init__(self, root_folder): diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 
4702802..8a9771a 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -51,7 +51,11 @@ help="number of data loading workers", ) parser.add_argument( - "--epochs", default=100, type=int, metavar="N", help="number of total epochs to run" + "--epochs", + default=100, + type=int, + metavar="N", + help="number of total epochs to run", ) parser.add_argument( "-b", @@ -63,23 +67,33 @@ "batch size of all GPUs on the current node when " "using Data Parallel or Distributed Data Parallel", ) - parser.add_argument( - "--seed", default=42, type=int, help="seed for initializing training. " + "--seed", + default=42, + type=int, + help="seed for initializing training. ", +) +parser.add_argument( + "--log-every-n-steps", + default=100, + type=int, + help="Log every n steps", ) - parser.add_argument( - "--distributed_mode", action="store_true", help="Enable distributed training" + "--distributed_mode", + action="store_true", + help="Enable distributed training", ) parser.add_argument("--distributed_launcher", default="slurm") parser.add_argument("--distributed_backend", default="nccl") -parser.add_argument("--pretrained_model_file", default=None, help="Path to the pretrained model file.") -parser.add_argument("--linear_evaluation", - action="store_true", - help="Whether or not to evaluate the linear evaluation of the model.") parser.add_argument( - "--log-every-n-steps", default=100, type=int, help="Log every n steps" -) + "--pretrained_model_file", + default=None, + help="Path to the pretrained model file.") +parser.add_argument( + "--linear_evaluation", + action="store_true", + help="Whether or not to evaluate the linear evaluation of the model.") parser.add_argument("--model_dir", default="model_checkpoints") parser.add_argument("--experiment_name", default="simclr") @@ -196,7 +210,11 @@ def main(): elif args.dataset_name == "imagenet": num_classes = 1000 - model = PretrainedResNet(base_model=args.arch, pretrained_dir = args.pretrained_model_file, 
linear_eval=args.linear_evaluation, num_classes=num_classes) + model = PretrainedResNet( + base_model=args.arch, + pretrained_dir = args.pretrained_model_file, + linear_eval=args.linear_evaluation, + num_classes=num_classes) if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): # set the single device scope, otherwise DistributedDataParallel will @@ -207,7 +225,12 @@ def main(): else: model = model.cuda() - optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.0008) + optimizer = torch.optim.Adam( + model.parameters(), + lr=3e-4, + weight_decay=0.0008, + ) + criterion = torch.nn.CrossEntropyLoss().cuda(device_id) n_iter = 0 @@ -260,7 +283,7 @@ def main(): checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch) save_checkpoint( { - "n_iter": n_iter, + "n_epoch": epoch, "arch": args.arch, "state_dict": model.state_dict(), "optimizer": optimizer.state_dict(), From e67b59831ce8bfaacdb052b54d344b63decbfd10 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 09:30:31 -0800 Subject: [PATCH 28/57] clean code --- SimCLR/data_aug/contrastive_learning_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index ef488b1..dd89386 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -7,7 +7,6 @@ from SimCLR.data_aug.rcdm_config import get_config from SimCLR.data_aug.view_generator import ContrastiveLearningViewGenerator from SimCLR.exceptions.exceptions import InvalidDatasetSelection - class ContrastiveLearningDataset: def __init__(self, root_folder): self.root_folder = root_folder From 56688ca2d61b51669a9a05d3fd50421b4747a284 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 09:30:53 -0800 Subject: [PATCH 29/57] clean code --- SimCLR/data_aug/contrastive_learning_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index dd89386..e6fbb3c 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -7,6 +7,8 @@ from SimCLR.data_aug.rcdm_config import get_config from SimCLR.data_aug.view_generator import ContrastiveLearningViewGenerator from SimCLR.exceptions.exceptions import InvalidDatasetSelection + + class ContrastiveLearningDataset: def __init__(self, root_folder): self.root_folder = root_folder From 084d05c96173638252b4ef000544d165d04b7dc0 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 09:31:14 -0800 Subject: [PATCH 30/57] clean code --- SimCLR/data_aug/contrastive_learning_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index e6fbb3c..c9bc448 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -113,4 +113,5 @@ def get_dataset( except KeyError: raise InvalidDatasetSelection() else: - return dataset_fn() \ No newline at end of file + return dataset_fn() + \ No newline at end of file From 1b9ccf35807ab4fcbd8e11c74230eeeff229d4c7 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 09:32:58 -0800 Subject: [PATCH 31/57] clean code --- SimCLR/simclr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index ecb2a80..e2f1eb7 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -63,7 +63,6 @@ def train(self, train_loader): self.scheduler.get_last_lr()[0], global_step=n_iter, ) - n_iter += 1 # warmup for the first 10 epochs From c707014586958f9774133905e300177e86d427db Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 09:33:48 -0800 Subject: [PATCH 32/57] clean code --- run_simCLR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/run_simCLR.py b/run_simCLR.py index 4d2615c..de526c5 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -169,7 +169,7 @@ def main(): assert ( args.n_views == 2 - ), "Only two view training is supported. Please use --n-views 2." + ), "Only two view training is supported. Please use --n-views 2." if args.distributed_mode: dist_utils.init_distributed_mode( From b36c694649aaf748cda95e5fcda19ac296ee4ac4 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 12:15:03 -0800 Subject: [PATCH 33/57] debug rcdm error --- SimCLR/data_aug/rcdm_aug.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index ccf2c9d..dd9f658 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -89,9 +89,11 @@ def __call__(self, img): if not self.config.use_ddim else self.diffusion.ddim_sample_loop ) - + print("1",img.shape) img = img.unsqueeze(0).repeat(1, 1, 1, 1) + print("2",img.shape) img = self.preprocess_input_image(img).cuda(self.device_id) + print("3",img.shape) model_kwargs = {} with torch.no_grad(): From cf9713d20a35e653469cce60507c363b9a1b1f4c Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 12:27:01 -0800 Subject: [PATCH 34/57] edit --- SimCLR/data_aug/rcdm_aug.py | 1 + 1 file changed, 1 insertion(+) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index dd9f658..bef1f84 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -89,6 +89,7 @@ def __call__(self, img): if not self.config.use_ddim else self.diffusion.ddim_sample_loop ) + print(img) print("1",img.shape) img = img.unsqueeze(0).repeat(1, 1, 1, 1) print("2",img.shape) From f8e8eb1d29cacddda1c2905b93bcab49281956d9 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 12:40:51 -0800 Subject: [PATCH 35/57] edit --- SimCLR/data_aug/rcdm_aug.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/SimCLR/data_aug/rcdm_aug.py 
b/SimCLR/data_aug/rcdm_aug.py index bef1f84..dfd396f 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -89,12 +89,10 @@ def __call__(self, img): if not self.config.use_ddim else self.diffusion.ddim_sample_loop ) - print(img) + img = self.preprocess_input_image(img).cuda(self.device_id) print("1",img.shape) img = img.unsqueeze(0).repeat(1, 1, 1, 1) print("2",img.shape) - img = self.preprocess_input_image(img).cuda(self.device_id) - print("3",img.shape) model_kwargs = {} with torch.no_grad(): From 575592344601464b211d3fa5036743723778def5 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 12:56:02 -0800 Subject: [PATCH 36/57] delete normalize --- SimCLR/data_aug/rcdm_aug.py | 1 - 1 file changed, 1 deletion(-) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index dfd396f..78b3072 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -63,7 +63,6 @@ def preprocess_input_image(self, input_image, size=224): data_utils.CenterCropLongEdge(), transforms.Resize((size, size)), transforms.ToTensor(), - transforms.Normalize(self.config.norm_mean, self.config.norm_std), ] ) tensor_image = transform_list(input_image) From 25d250f9a4c9bbfad4c39782567e34206aeb5394 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 13:06:04 -0800 Subject: [PATCH 37/57] edit --- SimCLR/data_aug/rcdm_aug.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index 78b3072..9494118 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -90,7 +90,7 @@ def __call__(self, img): ) img = self.preprocess_input_image(img).cuda(self.device_id) print("1",img.shape) - img = img.unsqueeze(0).repeat(1, 1, 1, 1) + img = img.repeat(1, 1, 1, 1) print("2",img.shape) model_kwargs = {} From 5754ae170b53518b597b6628ef7d5a425000dea4 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 13:18:22 -0800 Subject: [PATCH 
38/57] delete print --- SimCLR/data_aug/rcdm_aug.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index 9494118..4cb0ed2 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -89,9 +89,7 @@ def __call__(self, img): else self.diffusion.ddim_sample_loop ) img = self.preprocess_input_image(img).cuda(self.device_id) - print("1",img.shape) img = img.repeat(1, 1, 1, 1) - print("2",img.shape) model_kwargs = {} with torch.no_grad(): @@ -104,5 +102,4 @@ def __call__(self, img): model_kwargs=model_kwargs, ) - print("Sampling completed!") return sample.squeeze(0) From 0642905efc31a38d54911b7591245e18639d04cb Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 20 Feb 2024 10:52:57 -0800 Subject: [PATCH 39/57] update evaluation --- SimCLR/data_aug/supervised_dataset.py | 2 +- SimCLR/models/resnet_pretrained.py | 6 +- SimCLR/simclr.py | 3 +- eval_simclr.slrm | 8 +-- evaluate_simCLR.py | 80 +++++++++++++++------------ run_simCLR.py | 11 ++++ tests/test_evaluation.py | 34 ++++++++++++ 7 files changed, 97 insertions(+), 47 deletions(-) create mode 100644 tests/test_evaluation.py diff --git a/SimCLR/data_aug/supervised_dataset.py b/SimCLR/data_aug/supervised_dataset.py index 6b1713a..774a113 100644 --- a/SimCLR/data_aug/supervised_dataset.py +++ b/SimCLR/data_aug/supervised_dataset.py @@ -16,7 +16,7 @@ def get_transform(size): size (int): Image size. 
""" transform_list = [ - transforms.Resize(size=(size,size)), + transforms.CenterCrop(size=size), transforms.ToTensor(), ] diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 4c3ea45..840577a 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -6,10 +6,10 @@ class PretrainedResNet(nn.Module): - def __init__(self, base_model, pretrained_dir, linear_eval=True, num_classes=10): + def __init__(self, base_model, pretrained_model_file, linear_eval=True, num_classes=10): super(PretrainedResNet, self).__init__() - self.pretrained_dir = pretrained_dir + self.pretrained_model_file = pretrained_model_file self.resnet_dict = { "resnet18": models.resnet18(pretrained=False, num_classes=num_classes), @@ -30,7 +30,7 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True, num_classes=10) assert len(parameters) == 2 # fc.weight, fc.bias def _load_pretrained(self): - checkpoint = torch.load(self.pretrained_dir, map_location='cpu') + checkpoint = torch.load(self.pretrained_model_file, map_location='cpu') state_dict = checkpoint["state_dict"] for k in list(state_dict.keys()): if k.startswith("module.backbone."): diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index e2f1eb7..21a8652 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -12,13 +12,12 @@ class SimCLR(object): - def __init__(self, *args, **kwargs): + def __init__(self, log_dir, *args, **kwargs): self.args = kwargs["args"] self.model = kwargs["model"] self.optimizer = kwargs["optimizer"] self.scheduler = kwargs["scheduler"] self.device_id = kwargs["device_id"] - log_dir = os.path.join(self.args.model_dir, self.args.experiment_name) self.writer = SummaryWriter(log_dir) self.criterion = loss.SimCLRContrastiveLoss(self.args.temperature).cuda( self.device_id diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 5c1010a..0b93ac3 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -30,8 +30,6 @@ nvidia-smi srun python 
evaluate_simCLR.py \ --distributed_mode \ --batch-size=256 \ ---pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ ---log-every-n-steps=100 \ ---linear_evaluation \ ---model_dir="/ssd003/projects/aieng/genssl/experiments" \ ---experiment_name="simclr" \ No newline at end of file +--pretrained_model_dir="/ssd003/projects/aieng/genssl/experiments/simclr" \ +--pretrained_model_name='/checkpoint_epoch_0003.pth.tar' \ +--linear_evaluation \ No newline at end of file diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 8a9771a..ee15949 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -12,7 +12,6 @@ from SimCLR import distributed as dist_utils from SimCLR.data_aug.supervised_dataset import SupervisedDataset from SimCLR.models.resnet_pretrained import PretrainedResNet -from SimCLR.simclr import SimCLR model_names = sorted( @@ -67,18 +66,30 @@ "batch size of all GPUs on the current node when " "using Data Parallel or Distributed Data Parallel", ) +parser.add_argument( + "--lr", + "--learning-rate", + default=0.0003, + type=float, + metavar="LR", + help="initial learning rate", + dest="lr", +) +parser.add_argument( + "--wd", + "--weight-decay", + default=8e-4, + type=float, + metavar="W", + help="weight decay (default: 1e-4)", + dest="weight_decay", +) parser.add_argument( "--seed", default=42, type=int, help="seed for initializing training. 
", ) -parser.add_argument( - "--log-every-n-steps", - default=100, - type=int, - help="Log every n steps", -) parser.add_argument( "--distributed_mode", action="store_true", @@ -87,15 +98,21 @@ parser.add_argument("--distributed_launcher", default="slurm") parser.add_argument("--distributed_backend", default="nccl") parser.add_argument( - "--pretrained_model_file", + "--pretrained_model_dir", + default=None, + help="Path to the pretrained model directory.") +parser.add_argument( + "--pretrained_model_name", default=None, - help="Path to the pretrained model file.") + help="Name of pretrained model.") parser.add_argument( "--linear_evaluation", action="store_true", help="Whether or not to evaluate the linear evaluation of the model.") -parser.add_argument("--model_dir", default="model_checkpoints") -parser.add_argument("--experiment_name", default="simclr") +parser.add_argument( + "--enable_checkpointing", + action="store_true", + help="Whether or not to enable checkpointing of the model.") def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: @@ -132,10 +149,8 @@ def accuracy(output, target, topk=(1,)): res.append(correct_k.mul_(100.0 / batch_size)) return res -def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): +def save_checkpoint(state, filename="checkpoint.pth.tar"): torch.save(state, filename) - if is_best: - shutil.copyfile(filename, "model_best.pth.tar") def main(): @@ -212,7 +227,7 @@ def main(): model = PretrainedResNet( base_model=args.arch, - pretrained_dir = args.pretrained_model_file, + pretrained_model_file = os.path.join(args.pretrained_model_dir, args.pretrained_model_name), linear_eval=args.linear_evaluation, num_classes=num_classes) @@ -227,17 +242,15 @@ def main(): optimizer = torch.optim.Adam( model.parameters(), - lr=3e-4, - weight_decay=0.0008, + lr=args.lr, + weight_decay=args.weight_decay, ) criterion = torch.nn.CrossEntropyLoss().cuda(device_id) n_iter = 0 - log_dir = 
os.path.join(args.model_dir, args.experiment_name) - if not os.path.exists(log_dir): - os.mkdir(log_dir) + log_dir = args.pretrained_model_dir for epoch in range(args.epochs): if dist_utils.is_dist_avail_and_initialized(): @@ -255,11 +268,6 @@ def main(): optimizer.zero_grad() loss.backward() optimizer.step() - if n_iter % args.log_every_n_steps == 0: - temp_train_acc = top1_train_accuracy / (counter + 1) - print( - f"Iter {n_iter}\t Top1 Train accuracy {temp_train_acc.item()}", - ) n_iter += 1 top1_train_accuracy /= counter + 1 @@ -280,17 +288,17 @@ def main(): print( f"Epoch {n_iter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", ) - checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch) - save_checkpoint( - { - "n_epoch": epoch, - "arch": args.arch, - "state_dict": model.state_dict(), - "optimizer": optimizer.state_dict(), - }, - is_best=False, - filename=os.path.join(log_dir, checkpoint_name), - ) + if args.enable_checkpointing: + checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch) + save_checkpoint( + { + "n_epoch": epoch, + "arch": args.arch, + "state_dict": model.state_dict(), + "optimizer": optimizer.state_dict(), + }, + filename=os.path.join(log_dir, checkpoint_name), + ) diff --git a/run_simCLR.py b/run_simCLR.py index 22b8601..4f52375 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -3,6 +3,8 @@ import random from functools import partial +import os +from datetime import datetime import torch from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 from torch.utils.data.distributed import DistributedSampler @@ -173,6 +175,14 @@ def main(): args = parser.parse_args() print(args) + # Create a directory to save the model checkpoints and logs + now = datetime.now() + dt_string = now.strftime("%d/%m/%Y_%H:%M") + log_dir = os.path.join(args.model_dir, args.experiment_name,dt_string) + if not 
os.path.exists(log_dir): + os.makedirs(log_dir) + + # Set the start method to spawn for distributed training torch.multiprocessing.set_start_method("spawn") assert ( @@ -252,6 +262,7 @@ def main(): ) simclr = SimCLR( + log_dir=log_dir, model=model, optimizer=optimizer, scheduler=scheduler, diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py new file mode 100644 index 0000000..b0cfb31 --- /dev/null +++ b/tests/test_evaluation.py @@ -0,0 +1,34 @@ +import pytest +import torch +from evaluate_simCLR import accuracy + +def test_accuracy()-> None: + # Create sample data + output = torch.tensor([[0.1, 0.5, 0.3], [0.2, 0.6, 0.2]]) + target = torch.tensor([1, 2]) + topk = (1,) + + # Calculate accuracy + res = accuracy(output, target, topk=topk) + + # Check if the result matches the expected accuracy + expected_accuracy = [50.0] + assert res == expected_accuracy + +def test_accuracy_topk_5(): + # Create sample data + output = torch.tensor([[0.1, 0.5, 0.3, 0.1, 0.4, 0.5, 0.2, 0.3, 0.1, 0.9], + [0.2, 0.6, 0.2, 0.1, 0.3, 0.6, 0.2, 0.4, 0.1, 0.8], + [0.3, 0.4, 0.3, 0.2, 0.5, 0.3, 0.1, 0.7, 0.2, 0.6], + [0.4, 0.3, 0.3, 0.5, 0.6, 0.1, 0.2, 0.8, 0.1, 0.7]]) + target = torch.tensor([6, 7, 8, 9]) # Targets that are not in the top 5 + topk = (5,) + + # Calculate accuracy + res = accuracy(output, target, topk=topk) + print(res) + + # Check if the result matches the expected accuracy + # In this case, the expected accuracy is 25.0 for all samples + expected_accuracy = [50.0] + assert res == expected_accuracy \ No newline at end of file From 245eb54e6fcb587e69110853d1a5a659508a46d8 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 20 Feb 2024 10:54:29 -0800 Subject: [PATCH 40/57] update formating --- eval_simclr.slrm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 0b93ac3..46ba26b 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -31,5 +31,5 @@ srun python evaluate_simCLR.py \ --distributed_mode \ 
--batch-size=256 \ --pretrained_model_dir="/ssd003/projects/aieng/genssl/experiments/simclr" \ ---pretrained_model_name='/checkpoint_epoch_0003.pth.tar' \ +--pretrained_model_name="/checkpoint_epoch_0003.pth.tar" \ --linear_evaluation \ No newline at end of file From 33c0c9386b30936567d2d2b1d737113592bedbe0 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 09:41:24 -0800 Subject: [PATCH 41/57] update logging --- run_simCLR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_simCLR.py b/run_simCLR.py index 4f52375..b4f51d2 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -177,7 +177,7 @@ def main(): # Create a directory to save the model checkpoints and logs now = datetime.now() - dt_string = now.strftime("%d/%m/%Y_%H:%M") + dt_string = now.strftime("%d_%m_%Y_%H:%M") log_dir = os.path.join(args.model_dir, args.experiment_name,dt_string) if not os.path.exists(log_dir): os.makedirs(log_dir) From 31409dd0aa851c2680cbeeec019d2864fb340aae Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 10:24:54 -0800 Subject: [PATCH 42/57] Update bash files --- eval_simclr.slrm | 25 +++++++++++++++++-------- evaluate_simCLR.py | 6 +++++- run_simCLR.py | 2 +- train_simclr.slrm | 15 +++++++++------ 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 46ba26b..bb70619 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -15,7 +15,7 @@ PY_ARGS=${@:1} # load virtual environment -source /ssd003/projects/aieng/envs/genssl/bin/activate +source /ssd003/projects/aieng/envs/genssl2/bin/activate export NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend export CUDA_LAUNCH_BLOCKING=1 @@ -26,10 +26,19 @@ export MASTER_PORT=45679 export PYTHONPATH="." 
nvidia-smi -# “srun” executes the script times -srun python evaluate_simCLR.py \ ---distributed_mode \ ---batch-size=256 \ ---pretrained_model_dir="/ssd003/projects/aieng/genssl/experiments/simclr" \ ---pretrained_model_name="/checkpoint_epoch_0003.pth.tar" \ ---linear_evaluation \ No newline at end of file +files=$(ls checkpoint_epoch_*) + +# Loop through each file and pass it as a parameter to the rest of the script +for file in $files +do + # “srun” executes the script times + srun python evaluate_simCLR.py \ + --distributed_mode \ + --batch-size=256 \ + --pretrained_model_dir="/projects/imagenet_synthetic/train_models" \ + --experiment_name="simclr/23_02_2024_13:02" \ + --pretrained_model_name=$file \ + --linear_evaluation \ + --arch="resnet50" + # Add your processing logic here +done \ No newline at end of file diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index ee15949..a87bb83 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -105,6 +105,10 @@ "--pretrained_model_name", default=None, help="Name of pretrained model.") +parser.add_argument( + "--experiment_name", + default=None, + help="Name of the experiment.") parser.add_argument( "--linear_evaluation", action="store_true", @@ -227,7 +231,7 @@ def main(): model = PretrainedResNet( base_model=args.arch, - pretrained_model_file = os.path.join(args.pretrained_model_dir, args.pretrained_model_name), + pretrained_model_file = os.path.join(args.pretrained_model_dir, args.experiment_name, args.pretrained_model_name), linear_eval=args.linear_evaluation, num_classes=num_classes) diff --git a/run_simCLR.py b/run_simCLR.py index b4f51d2..02c624a 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -260,7 +260,7 @@ def main(): scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=len(train_loader), eta_min=0, last_epoch=-1 ) - + print(device_id,flush=True) simclr = SimCLR( log_dir=log_dir, model=model, diff --git a/train_simclr.slrm b/train_simclr.slrm index 477a1fb..cf0cf25 100644 --- 
a/train_simclr.slrm +++ b/train_simclr.slrm @@ -1,14 +1,15 @@ #!/bin/bash #SBATCH --job-name=train_sunrgbd -#SBATCH --partition=t4v2 -#SBATCH --time=12:00:00 +#SBATCH --partition=a100 #SBATCH --nodes=1 #SBATCH --gres=gpu:4 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 -#SBATCH --mem-per-cpu=2G -#SBATCH --output=slurm-%N-%j.out +#SBATCH --mem-per-cpu=8G +#SBATCH --output=/h/sayromlou/GenerativeSSL/logs/simclr/slurm-%N-%j.out +#SBATCH --error=/h/sayromlou/GenerativeSSL/logs/simclr/slurm-%N-%j.err +#SBATCH --qos=a100_arashaf PY_ARGS=${@:1} @@ -28,5 +29,7 @@ nvidia-smi srun python run_simCLR.py \ --fp16-precision \ --distributed_mode \ ---batch-size=4 \ ---icgan_augmentation +--batch-size=256 \ +--model_dir="/projects/imagenet_synthetic/train_models" \ +--experiment_name="simclr" \ +--arch="resnet50" From eb9a4b6633a56b823263aacc0040ea2a1e121df4 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 11:44:54 -0800 Subject: [PATCH 43/57] Update augmentation and saving file --- .../contrastive_learning_dataset.py | 12 ++--- SimCLR/datasets/data_aug/center_crop.py | 44 +++++++++++++++++++ .../{ => datasets}/data_aug/gaussian_blur.py | 0 SimCLR/{ => datasets}/data_aug/icgan_aug.py | 0 .../{ => datasets}/data_aug/icgan_config.py | 0 SimCLR/{ => datasets}/data_aug/rcdm_aug.py | 0 SimCLR/{ => datasets}/data_aug/rcdm_config.py | 0 .../supervised_dataset.py | 5 ++- .../{data_aug => datasets}/view_generator.py | 0 eval_simclr.slrm | 17 +++---- evaluate_simCLR.py | 3 +- run_simCLR.py | 5 +-- train_simclr.slrm | 17 +++---- 13 files changed, 75 insertions(+), 28 deletions(-) rename SimCLR/{data_aug => datasets}/contrastive_learning_dataset.py (91%) create mode 100644 SimCLR/datasets/data_aug/center_crop.py rename SimCLR/{ => datasets}/data_aug/gaussian_blur.py (100%) rename SimCLR/{ => datasets}/data_aug/icgan_aug.py (100%) rename SimCLR/{ => datasets}/data_aug/icgan_config.py (100%) rename SimCLR/{ => datasets}/data_aug/rcdm_aug.py (100%) rename SimCLR/{ => 
datasets}/data_aug/rcdm_config.py (100%) rename SimCLR/{data_aug => datasets}/supervised_dataset.py (91%) rename SimCLR/{data_aug => datasets}/view_generator.py (100%) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/datasets/contrastive_learning_dataset.py similarity index 91% rename from SimCLR/data_aug/contrastive_learning_dataset.py rename to SimCLR/datasets/contrastive_learning_dataset.py index c9bc448..3723652 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/datasets/contrastive_learning_dataset.py @@ -1,11 +1,11 @@ from torchvision import datasets, transforms -from SimCLR.data_aug.gaussian_blur import GaussianBlur -from SimCLR.data_aug.icgan_aug import ICGANInference -from SimCLR.data_aug.icgan_config import get_icgan_config -from SimCLR.data_aug.rcdm_aug import RCDMInference -from SimCLR.data_aug.rcdm_config import get_config -from SimCLR.data_aug.view_generator import ContrastiveLearningViewGenerator +from SimCLR.datasets.data_aug.gaussian_blur import GaussianBlur +from SimCLR.datasets.data_aug.icgan_aug import ICGANInference +from SimCLR.datasets.data_aug.icgan_config import get_icgan_config +from SimCLR.datasets.data_aug.rcdm_aug import RCDMInference +from SimCLR.datasets.data_aug.rcdm_config import get_config +from SimCLR.datasets.view_generator import ContrastiveLearningViewGenerator from SimCLR.exceptions.exceptions import InvalidDatasetSelection diff --git a/SimCLR/datasets/data_aug/center_crop.py b/SimCLR/datasets/data_aug/center_crop.py new file mode 100644 index 0000000..783749c --- /dev/null +++ b/SimCLR/datasets/data_aug/center_crop.py @@ -0,0 +1,44 @@ +import torch.nn.functional as F +import torchvision +import torch + +class CostumeCenterCrop(torch.nn.Module): + def __init__(self, size=None, ratio="1:1"): + super().__init__() + self.size = size + self.ratio = ratio + def forward(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be cropped. 
+ + Returns: + PIL Image or Tensor: Cropped image. + """ + if self.size is None: + if isinstance(img, torch.Tensor): + h, w = img.shape[-2:] + else: + w, h = img.size + ratio = self.ratio.split(":") + ratio = float(ratio[0]) / float(ratio[1]) + # Size must match the ratio while cropping to the edge of the image + ratioed_w = int(h * ratio) + ratioed_h = int(w / ratio) + if w>=h: + if ratioed_h <= h: + size = (ratioed_h, w) + else: + size = (h, ratioed_w) + else: + if ratioed_w <= w: + size = (h, ratioed_w) + else: + size = (ratioed_h, w) + else: + size = self.size + return torchvision.transforms.functional.center_crop(img, size) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(size={self.size})" + \ No newline at end of file diff --git a/SimCLR/data_aug/gaussian_blur.py b/SimCLR/datasets/data_aug/gaussian_blur.py similarity index 100% rename from SimCLR/data_aug/gaussian_blur.py rename to SimCLR/datasets/data_aug/gaussian_blur.py diff --git a/SimCLR/data_aug/icgan_aug.py b/SimCLR/datasets/data_aug/icgan_aug.py similarity index 100% rename from SimCLR/data_aug/icgan_aug.py rename to SimCLR/datasets/data_aug/icgan_aug.py diff --git a/SimCLR/data_aug/icgan_config.py b/SimCLR/datasets/data_aug/icgan_config.py similarity index 100% rename from SimCLR/data_aug/icgan_config.py rename to SimCLR/datasets/data_aug/icgan_config.py diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/datasets/data_aug/rcdm_aug.py similarity index 100% rename from SimCLR/data_aug/rcdm_aug.py rename to SimCLR/datasets/data_aug/rcdm_aug.py diff --git a/SimCLR/data_aug/rcdm_config.py b/SimCLR/datasets/data_aug/rcdm_config.py similarity index 100% rename from SimCLR/data_aug/rcdm_config.py rename to SimCLR/datasets/data_aug/rcdm_config.py diff --git a/SimCLR/data_aug/supervised_dataset.py b/SimCLR/datasets/supervised_dataset.py similarity index 91% rename from SimCLR/data_aug/supervised_dataset.py rename to SimCLR/datasets/supervised_dataset.py index 774a113..51efb69 100644 --- 
a/SimCLR/data_aug/supervised_dataset.py +++ b/SimCLR/datasets/supervised_dataset.py @@ -2,7 +2,7 @@ from torchvision.transforms import transforms from SimCLR.exceptions.exceptions import InvalidDatasetSelection -import random +from SimCLR.datasets.data_aug.center_crop import CostumeCenterCrop class SupervisedDataset: def __init__(self, root_folder): @@ -16,7 +16,8 @@ def get_transform(size): size (int): Image size. """ transform_list = [ - transforms.CenterCrop(size=size), + CostumeCenterCrop(), + transforms.Resize((size, size)), transforms.ToTensor(), ] diff --git a/SimCLR/data_aug/view_generator.py b/SimCLR/datasets/view_generator.py similarity index 100% rename from SimCLR/data_aug/view_generator.py rename to SimCLR/datasets/view_generator.py diff --git a/eval_simclr.slrm b/eval_simclr.slrm index bb70619..0d2164c 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -5,11 +5,11 @@ #SBATCH --time=12:00:00 #SBATCH --nodes=1 #SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=4 -#SBATCH --mem-per-cpu=2G -#SBATCH --output=./runs/eval_slurm-%N-%j.out -#SBATCH --error=./runs/eval_slurm-%N-%j.err +#SBATCH --mem=100G +#SBATCH --output=./logs/simclr/eval_slurm-%N-%j.out +#SBATCH --error=./logs/simclr/eval_slurm-%N-%j.err #SBATCH --qos=m PY_ARGS=${@:1} @@ -17,7 +17,7 @@ PY_ARGS=${@:1} # load virtual environment source /ssd003/projects/aieng/envs/genssl2/bin/activate -export NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend export CUDA_LAUNCH_BLOCKING=1 export MASTER_ADDR=$(hostname) @@ -31,12 +31,13 @@ files=$(ls checkpoint_epoch_*) # Loop through each file and pass it as a parameter to the rest of the script for file in $files do - # “srun” executes the script times - srun python evaluate_simCLR.py \ + # torchrun execute nproc-per-node * nodes times + torchrun --nnodes 1 --nproc-per-node 4 evaluate_simCLR.py \ --distributed_mode \ + 
--distributed_launcher="pytorch" \ --batch-size=256 \ --pretrained_model_dir="/projects/imagenet_synthetic/train_models" \ - --experiment_name="simclr/23_02_2024_13:02" \ + --experiment_name="simclr/2024_02_23_13_02" \ --pretrained_model_name=$file \ --linear_evaluation \ --arch="resnet50" diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index a87bb83..36b5d01 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -10,7 +10,7 @@ import shutil from SimCLR import distributed as dist_utils -from SimCLR.data_aug.supervised_dataset import SupervisedDataset +from SimCLR.datasets.supervised_dataset import SupervisedDataset from SimCLR.models.resnet_pretrained import PretrainedResNet @@ -291,6 +291,7 @@ def main(): top5_accuracy /= counter + 1 print( f"Epoch {n_iter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", + flush=True, ) if args.enable_checkpointing: checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch) diff --git a/run_simCLR.py b/run_simCLR.py index 02c624a..3d74b29 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -11,7 +11,7 @@ from torchvision import models from SimCLR import distributed as dist_utils -from SimCLR.data_aug.contrastive_learning_dataset import ContrastiveLearningDataset +from SimCLR.datasets.contrastive_learning_dataset import ContrastiveLearningDataset from SimCLR.models.resnet_simclr import ResNetSimCLR from SimCLR.simclr import SimCLR from torch.utils.data import Subset @@ -177,7 +177,7 @@ def main(): # Create a directory to save the model checkpoints and logs now = datetime.now() - dt_string = now.strftime("%d_%m_%Y_%H:%M") + dt_string = now.strftime("%Y_%m_%d_%H_%M") log_dir = os.path.join(args.model_dir, args.experiment_name,dt_string) if not os.path.exists(log_dir): os.makedirs(log_dir) @@ -260,7 +260,6 @@ def main(): scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=len(train_loader), eta_min=0, 
last_epoch=-1 ) - print(device_id,flush=True) simclr = SimCLR( log_dir=log_dir, model=model, diff --git a/train_simclr.slrm b/train_simclr.slrm index cf0cf25..01f500e 100644 --- a/train_simclr.slrm +++ b/train_simclr.slrm @@ -4,11 +4,11 @@ #SBATCH --partition=a100 #SBATCH --nodes=1 #SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=4 -#SBATCH --mem-per-cpu=8G -#SBATCH --output=/h/sayromlou/GenerativeSSL/logs/simclr/slurm-%N-%j.out -#SBATCH --error=/h/sayromlou/GenerativeSSL/logs/simclr/slurm-%N-%j.err +#SBATCH --mem=100G +#SBATCH --output=./logs/simclr/slurm-%N-%j.out +#SBATCH --error=./logs/simclr/slurm-%N-%j.err #SBATCH --qos=a100_arashaf PY_ARGS=${@:1} @@ -16,7 +16,7 @@ PY_ARGS=${@:1} # load virtual environment source /ssd003/projects/aieng/envs/genssl2/bin/activate -export NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend export CUDA_LAUNCH_BLOCKING=1 export MASTER_ADDR=$(hostname) @@ -25,11 +25,12 @@ export MASTER_PORT=45679 export PYTHONPATH="." 
nvidia-smi -# “srun” executes the script times -srun python run_simCLR.py \ +# torchrun execute nproc-per-node * nodes times +torchrun --nnodes 1 --nproc-per-node 4 run_simCLR.py \ --fp16-precision \ --distributed_mode \ +--distributed_launcher="pytorch" \ --batch-size=256 \ --model_dir="/projects/imagenet_synthetic/train_models" \ --experiment_name="simclr" \ ---arch="resnet50" +--arch="resnet50" \ No newline at end of file From 288f749404861c71cce4e5ca1cf53e631f3b2cb6 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:03:27 -0800 Subject: [PATCH 44/57] update evaluation --- eval_simclr.slrm | 11 ++++++----- evaluate_simCLR.py | 10 +++++----- train_simclr.slrm | 4 ++-- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 0d2164c..2e8bb00 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -1,16 +1,15 @@ #!/bin/bash #SBATCH --job-name=train_sunrgbd -#SBATCH --partition=t4v2 -#SBATCH --time=12:00:00 +#SBATCH --partition=a100 #SBATCH --nodes=1 #SBATCH --gres=gpu:4 #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=4 #SBATCH --mem=100G -#SBATCH --output=./logs/simclr/eval_slurm-%N-%j.out -#SBATCH --error=./logs/simclr/eval_slurm-%N-%j.err -#SBATCH --qos=m +#SBATCH --output=logs/simclr/eval_slurm-%N-%j.out +#SBATCH --error=logs/simclr/eval_slurm-%N-%j.err +#SBATCH --qos=a100_arashaf PY_ARGS=${@:1} @@ -31,6 +30,8 @@ files=$(ls checkpoint_epoch_*) # Loop through each file and pass it as a parameter to the rest of the script for file in $files do + echo "Evaluating: $file" + # torchrun execute nproc-per-node * nodes times torchrun --nnodes 1 --nproc-per-node 4 evaluate_simCLR.py \ --distributed_mode \ diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 36b5d01..42e8a19 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -7,7 +7,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 from torch.utils.data.distributed import DistributedSampler from torchvision 
import models -import shutil +from tqdm import tqdm from SimCLR import distributed as dist_utils from SimCLR.datasets.supervised_dataset import SupervisedDataset @@ -256,9 +256,9 @@ def main(): log_dir = args.pretrained_model_dir - for epoch in range(args.epochs): + for epoch_counter in tqdm(range(args.epochs), desc="Training Progress"): if dist_utils.is_dist_avail_and_initialized(): - train_loader.sampler.set_epoch(epoch) + train_loader.sampler.set_epoch(epoch_counter) top1_train_accuracy = 0 for counter, (x_batch, y_batch) in enumerate(train_loader): x_batch = x_batch.cuda(device_id) @@ -294,10 +294,10 @@ def main(): flush=True, ) if args.enable_checkpointing: - checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch) + checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch_counter) save_checkpoint( { - "n_epoch": epoch, + "n_epoch": epoch_counter, "arch": args.arch, "state_dict": model.state_dict(), "optimizer": optimizer.state_dict(), diff --git a/train_simclr.slrm b/train_simclr.slrm index 01f500e..b5d0acb 100644 --- a/train_simclr.slrm +++ b/train_simclr.slrm @@ -7,8 +7,8 @@ #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=4 #SBATCH --mem=100G -#SBATCH --output=./logs/simclr/slurm-%N-%j.out -#SBATCH --error=./logs/simclr/slurm-%N-%j.err +#SBATCH --output=logs/simclr/slurm-%N-%j.out +#SBATCH --error=logs/simclr/slurm-%N-%j.err #SBATCH --qos=a100_arashaf PY_ARGS=${@:1} From 3b43d4e277fb59f360b93ffe9ada9c92ea1be017 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:12:43 -0800 Subject: [PATCH 45/57] Update bash file --- eval_simclr.slrm | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 2e8bb00..eeb9d63 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -25,7 +25,10 @@ export MASTER_PORT=45679 export PYTHONPATH="." 
nvidia-smi -files=$(ls checkpoint_epoch_*) +pretrained_model_dir="/projects/imagenet_synthetic/train_models" +experiment_name="simclr/2024_02_23_13_02" + +files=$(ls $pretrained_model_dir/$experiment_name/checkpoint_epoch_*) # Loop through each file and pass it as a parameter to the rest of the script for file in $files @@ -37,8 +40,8 @@ do --distributed_mode \ --distributed_launcher="pytorch" \ --batch-size=256 \ - --pretrained_model_dir="/projects/imagenet_synthetic/train_models" \ - --experiment_name="simclr/2024_02_23_13_02" \ + --pretrained_model_dir=$pretrained_model_dir \ + --experiment_name=$experiment_name \ --pretrained_model_name=$file \ --linear_evaluation \ --arch="resnet50" From e64fe5c5bcd01354d64167e4357b2a2b1b311064 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:22:49 -0800 Subject: [PATCH 46/57] edit eval --- eval_simclr.slrm | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index eeb9d63..e2e1d7a 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -28,7 +28,11 @@ nvidia-smi pretrained_model_dir="/projects/imagenet_synthetic/train_models" experiment_name="simclr/2024_02_23_13_02" -files=$(ls $pretrained_model_dir/$experiment_name/checkpoint_epoch_*) +cd $pretrained_model_dir/$experiment_name + +files=$(ls checkpoint_epoch_*) + +cd "$OLDPWD" # Loop through each file and pass it as a parameter to the rest of the script for file in $files From ef6a2147bd0d29d4cedd04de0b2058872be4107d Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:27:59 -0800 Subject: [PATCH 47/57] check loading --- SimCLR/models/resnet_pretrained.py | 1 + 1 file changed, 1 insertion(+) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 840577a..ef088a0 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -10,6 +10,7 @@ def __init__(self, base_model, pretrained_model_file, linear_eval=True, num_clas 
super(PretrainedResNet, self).__init__() self.pretrained_model_file = pretrained_model_file + print(self.pretrained_model_file, flush=True) self.resnet_dict = { "resnet18": models.resnet18(pretrained=False, num_classes=num_classes), From cef45728bae2df346aa63dbb31deb0d7321db260 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:41:08 -0800 Subject: [PATCH 48/57] debug eval --- evaluate_simCLR.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 42e8a19..95029b8 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -227,6 +227,7 @@ def main(): elif args.dataset_name == "stl10": num_classes = 10 elif args.dataset_name == "imagenet": + print("Using ImageNet dataset", flush=True) num_classes = 1000 model = PretrainedResNet( @@ -234,6 +235,8 @@ def main(): pretrained_model_file = os.path.join(args.pretrained_model_dir, args.experiment_name, args.pretrained_model_name), linear_eval=args.linear_evaluation, num_classes=num_classes) + + print("loaded model", flush=True) if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): # set the single device scope, otherwise DistributedDataParallel will @@ -254,7 +257,9 @@ def main(): n_iter = 0 - log_dir = args.pretrained_model_dir + log_dir = os.path.joinq(args.pretrained_model_dir, args.experiment_name) + + print(f"log_dir:{log_dir}", flush=True) for epoch_counter in tqdm(range(args.epochs), desc="Training Progress"): if dist_utils.is_dist_avail_and_initialized(): From 883d9c0465d3c289ab122e43722b063b14250f92 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:45:08 -0800 Subject: [PATCH 49/57] update --- evaluate_simCLR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 95029b8..0291e22 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -257,7 +257,7 @@ def main(): n_iter = 0 - log_dir = os.path.joinq(args.pretrained_model_dir, 
args.experiment_name) + log_dir = os.path.join(args.pretrained_model_dir, args.experiment_name) print(f"log_dir:{log_dir}", flush=True) From c900b9fab5ab522f92e93b7d4c28beb0dc96af45 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:51:32 -0800 Subject: [PATCH 50/57] check evaluation --- evaluate_simCLR.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 0291e22..27f2416 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -261,11 +261,12 @@ def main(): print(f"log_dir:{log_dir}", flush=True) - for epoch_counter in tqdm(range(args.epochs), desc="Training Progress"): + for epoch_counter in tqdm(range(args.epochs), desc="Epoch Progress"): if dist_utils.is_dist_avail_and_initialized(): train_loader.sampler.set_epoch(epoch_counter) top1_train_accuracy = 0 - for counter, (x_batch, y_batch) in enumerate(train_loader): + print(f"epoch:{epoch_counter}", flush=True) + for counter, (x_batch, y_batch) in tqdm(enumerate(train_loader), desc="Training Progress"): x_batch = x_batch.cuda(device_id) y_batch = y_batch.cuda(device_id) @@ -278,11 +279,13 @@ def main(): loss.backward() optimizer.step() n_iter += 1 + if counter % 100 == 0: + print(f"Epoch {epoch_counter}\t Iteration {counter}\t Loss: {loss.item()}", flush=True) top1_train_accuracy /= counter + 1 top1_accuracy = 0 top5_accuracy = 0 - for counter, (x_batch, y_batch) in enumerate(test_loader): + for counter, (x_batch, y_batch) in tqdm(enumerate(test_loader), desc="Evaluation Progress"): x_batch = x_batch.cuda(device_id) y_batch = y_batch.cuda(device_id) From 314633cf1d3186c0fde8bb47be9e63e27262cfb9 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:58:13 -0800 Subject: [PATCH 51/57] Clean the code --- SimCLR/models/resnet_pretrained.py | 1 - evaluate_simCLR.py | 27 ++++++++++----------------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/SimCLR/models/resnet_pretrained.py 
b/SimCLR/models/resnet_pretrained.py index ef088a0..840577a 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -10,7 +10,6 @@ def __init__(self, base_model, pretrained_model_file, linear_eval=True, num_clas super(PretrainedResNet, self).__init__() self.pretrained_model_file = pretrained_model_file - print(self.pretrained_model_file, flush=True) self.resnet_dict = { "resnet18": models.resnet18(pretrained=False, num_classes=num_classes), diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 27f2416..49f424e 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -227,7 +227,6 @@ def main(): elif args.dataset_name == "stl10": num_classes = 10 elif args.dataset_name == "imagenet": - print("Using ImageNet dataset", flush=True) num_classes = 1000 model = PretrainedResNet( @@ -235,8 +234,6 @@ def main(): pretrained_model_file = os.path.join(args.pretrained_model_dir, args.experiment_name, args.pretrained_model_name), linear_eval=args.linear_evaluation, num_classes=num_classes) - - print("loaded model", flush=True) if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): # set the single device scope, otherwise DistributedDataParallel will @@ -255,18 +252,14 @@ def main(): criterion = torch.nn.CrossEntropyLoss().cuda(device_id) - n_iter = 0 - log_dir = os.path.join(args.pretrained_model_dir, args.experiment_name) - print(f"log_dir:{log_dir}", flush=True) - for epoch_counter in tqdm(range(args.epochs), desc="Epoch Progress"): if dist_utils.is_dist_avail_and_initialized(): train_loader.sampler.set_epoch(epoch_counter) top1_train_accuracy = 0 - print(f"epoch:{epoch_counter}", flush=True) - for counter, (x_batch, y_batch) in tqdm(enumerate(train_loader), desc="Training Progress"): + counter = 0 + for counter, (x_batch, y_batch) in tqdm(train_loader, desc="Training Progress"): x_batch = x_batch.cuda(device_id) y_batch = y_batch.cuda(device_id) @@ -278,14 +271,13 @@ def main(): optimizer.zero_grad() 
loss.backward() optimizer.step() - n_iter += 1 - if counter % 100 == 0: - print(f"Epoch {epoch_counter}\t Iteration {counter}\t Loss: {loss.item()}", flush=True) + counter += 1 - top1_train_accuracy /= counter + 1 + top1_train_accuracy /= counter top1_accuracy = 0 top5_accuracy = 0 - for counter, (x_batch, y_batch) in tqdm(enumerate(test_loader), desc="Evaluation Progress"): + counter = 0 + for x_batch, y_batch in tqdm(test_loader, desc="Evaluation Progress"): x_batch = x_batch.cuda(device_id) y_batch = y_batch.cuda(device_id) @@ -294,11 +286,12 @@ def main(): top1, top5 = accuracy(logits, y_batch, topk=(1, 5)) top1_accuracy += top1[0] top5_accuracy += top5[0] + counter += 1 - top1_accuracy /= counter + 1 - top5_accuracy /= counter + 1 + top1_accuracy /= counter + top5_accuracy /= counter print( - f"Epoch {n_iter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", + f"Epoch {epoch_counter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", flush=True, ) if args.enable_checkpointing: From ed78e17d9d2b785aa884c4e6407d1afa16fedb0d Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:59:47 -0800 Subject: [PATCH 52/57] update --- evaluate_simCLR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 49f424e..cc1cca3 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -259,7 +259,7 @@ def main(): train_loader.sampler.set_epoch(epoch_counter) top1_train_accuracy = 0 counter = 0 - for counter, (x_batch, y_batch) in tqdm(train_loader, desc="Training Progress"): + for x_batch, y_batch in tqdm(train_loader, desc="Training Progress"): x_batch = x_batch.cuda(device_id) y_batch = y_batch.cuda(device_id) From a3926674d2d77dc114baf1803332ccbdbdca4e57 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 13:10:11 -0800 
Subject: [PATCH 53/57] try catch the file exist error --- run_simCLR.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/run_simCLR.py b/run_simCLR.py index 3d74b29..cd8cebd 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -179,8 +179,10 @@ def main(): now = datetime.now() dt_string = now.strftime("%Y_%m_%d_%H_%M") log_dir = os.path.join(args.model_dir, args.experiment_name,dt_string) - if not os.path.exists(log_dir): + try: os.makedirs(log_dir) + except FileExistsError: + print(f"Directory {log_dir} made by another worker") # Set the start method to spawn for distributed training torch.multiprocessing.set_start_method("spawn") From 6c5cf20240b64a3dd418b3e16132cc0f37273e37 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 13:10:25 -0800 Subject: [PATCH 54/57] update --- run_simCLR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_simCLR.py b/run_simCLR.py index cd8cebd..c067f63 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -182,7 +182,7 @@ def main(): try: os.makedirs(log_dir) except FileExistsError: - print(f"Directory {log_dir} made by another worker") + print(f"Directory {log_dir} made by another worker", flush=True) # Set the start method to spawn for distributed training torch.multiprocessing.set_start_method("spawn") From 313b7055d39c7bb0ff305c97ac558545bcb92fd3 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 13:13:48 -0800 Subject: [PATCH 55/57] update logging part --- SimCLR/simclr.py | 11 ++++++++++- run_simCLR.py | 12 ------------ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index 21a8652..e03ee60 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -1,4 +1,5 @@ import os +from datetime import datetime import torch from torch.cuda.amp import GradScaler, autocast @@ -12,12 +13,20 @@ class SimCLR(object): - def __init__(self, log_dir, *args, **kwargs): + def __init__(self, *args, **kwargs): self.args = kwargs["args"] 
self.model = kwargs["model"] self.optimizer = kwargs["optimizer"] self.scheduler = kwargs["scheduler"] self.device_id = kwargs["device_id"] + # Create a directory to save the model checkpoints and logs + now = datetime.now() + dt_string = now.strftime("%Y_%m_%d_%H_%M") + log_dir = os.path.join(args.model_dir, args.experiment_name,dt_string) + try: + os.makedirs(log_dir) + except FileExistsError: + print(f"Directory {log_dir} made by another worker", flush=True) self.writer = SummaryWriter(log_dir) self.criterion = loss.SimCLRContrastiveLoss(self.args.temperature).cuda( self.device_id diff --git a/run_simCLR.py b/run_simCLR.py index c067f63..c307027 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -3,8 +3,6 @@ import random from functools import partial -import os -from datetime import datetime import torch from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 from torch.utils.data.distributed import DistributedSampler @@ -175,15 +173,6 @@ def main(): args = parser.parse_args() print(args) - # Create a directory to save the model checkpoints and logs - now = datetime.now() - dt_string = now.strftime("%Y_%m_%d_%H_%M") - log_dir = os.path.join(args.model_dir, args.experiment_name,dt_string) - try: - os.makedirs(log_dir) - except FileExistsError: - print(f"Directory {log_dir} made by another worker", flush=True) - # Set the start method to spawn for distributed training torch.multiprocessing.set_start_method("spawn") @@ -263,7 +252,6 @@ def main(): optimizer, T_max=len(train_loader), eta_min=0, last_epoch=-1 ) simclr = SimCLR( - log_dir=log_dir, model=model, optimizer=optimizer, scheduler=scheduler, From 67b5a9f0d3be56cd42645528ff67703f556948d9 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Sun, 25 Feb 2024 10:05:19 -0800 Subject: [PATCH 56/57] update slrm scripts --- eval_simclr.slrm | 14 ++++++-------- train_simclr.slrm | 8 +++----- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 
e2e1d7a..6c4b627 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -1,15 +1,15 @@ #!/bin/bash #SBATCH --job-name=train_sunrgbd -#SBATCH --partition=a100 +#SBATCH --partition=t4v2 #SBATCH --nodes=1 #SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 #SBATCH --mem=100G #SBATCH --output=logs/simclr/eval_slurm-%N-%j.out #SBATCH --error=logs/simclr/eval_slurm-%N-%j.err -#SBATCH --qos=a100_arashaf +#SBATCH --qos=m PY_ARGS=${@:1} @@ -39,15 +39,13 @@ for file in $files do echo "Evaluating: $file" - # torchrun execute nproc-per-node * nodes times - torchrun --nnodes 1 --nproc-per-node 4 evaluate_simCLR.py \ + # srun execute ntasks-per-node * nodes times + srun python evaluate_simCLR.py \ --distributed_mode \ - --distributed_launcher="pytorch" \ --batch-size=256 \ --pretrained_model_dir=$pretrained_model_dir \ --experiment_name=$experiment_name \ --pretrained_model_name=$file \ - --linear_evaluation \ - --arch="resnet50" + --linear_evaluation # Add your processing logic here done \ No newline at end of file diff --git a/train_simclr.slrm b/train_simclr.slrm index b5d0acb..0a352eb 100644 --- a/train_simclr.slrm +++ b/train_simclr.slrm @@ -25,12 +25,10 @@ export MASTER_PORT=45679 export PYTHONPATH="." 
nvidia-smi -# torchrun execute nproc-per-node * nodes times -torchrun --nnodes 1 --nproc-per-node 4 run_simCLR.py \ +# srun execute ntasks-per-node * nodes times +srun python run_simCLR.py \ --fp16-precision \ --distributed_mode \ ---distributed_launcher="pytorch" \ --batch-size=256 \ --model_dir="/projects/imagenet_synthetic/train_models" \ ---experiment_name="simclr" \ ---arch="resnet50" \ No newline at end of file +--experiment_name="simclr" \ No newline at end of file From 1e2ba8142c309ca905b5e579db835a9c99c5f719 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 27 Feb 2024 17:43:15 -0800 Subject: [PATCH 57/57] update resnet pretrained --- SimCLR/models/resnet_pretrained.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 840577a..bc37e85 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -34,7 +34,7 @@ def _load_pretrained(self): state_dict = checkpoint["state_dict"] for k in list(state_dict.keys()): if k.startswith("module.backbone."): - if k.startswith("module.backbone") and not k.startswith("module.backbone.fc"): + if not k.startswith("module.backbone.fc"): # remove prefix state_dict[k[len("module.backbone.") :]] = state_dict[k] del state_dict[k]