From acb48e2b17a82d0296246a8ed9dee72c24eb4c6c Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 24 Jan 2024 10:54:10 -0800 Subject: [PATCH 01/57] Distributed rcdm --- .../data_aug/contrastive_learning_dataset.py | 18 +++++++------ run_simCLR.py | 25 +++++++++---------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index d8eb61d..5e9e709 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -13,7 +13,7 @@ def __init__(self, root_folder): self.root_folder = root_folder @staticmethod - def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True): + def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True, device_id=None): """Return a set of data augmentation transformations as described in the SimCLR paper. Args: @@ -32,19 +32,21 @@ def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True): ] if rcdm_agumentation: rcdm_config = get_config() - transform_list.append(RCDMInference(rcdm_config)) + transform_list.append(RCDMInference(rcdm_config, device_id)) transform_list.append(transforms.Resize(size=(size, size))) + return transforms.Compose(transform_list) - def get_dataset(self, name, n_views, rcdm_agumentation=True): + def get_dataset(self, name, n_views, rcdm_agumentation=False, device_id=None): valid_datasets = { "cifar10": lambda: datasets.CIFAR10( self.root_folder, train=True, transform=ContrastiveLearningViewGenerator( self.get_simclr_pipeline_transform( - 32, rcdm_agumentation=rcdm_agumentation + 32, rcdm_agumentation=rcdm_agumentation, + device_id=device_id ), n_views, ), @@ -55,7 +57,8 @@ def get_dataset(self, name, n_views, rcdm_agumentation=True): split="unlabeled", transform=ContrastiveLearningViewGenerator( self.get_simclr_pipeline_transform( - 96, rcdm_agumentation=rcdm_agumentation + 96, rcdm_agumentation=rcdm_agumentation, + device_id=device_id 
), n_views, ), @@ -66,7 +69,8 @@ def get_dataset(self, name, n_views, rcdm_agumentation=True): split="train", transform=ContrastiveLearningViewGenerator( self.get_simclr_pipeline_transform( - 224, rcdm_agumentation=rcdm_agumentation + 224, rcdm_agumentation=rcdm_agumentation, + device_id=device_id ), n_views, ), @@ -78,4 +82,4 @@ def get_dataset(self, name, n_views, rcdm_agumentation=True): except KeyError: raise InvalidDatasetSelection() else: - return dataset_fn() + return dataset_fn() \ No newline at end of file diff --git a/run_simCLR.py b/run_simCLR.py index f4d6b84..564416b 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -110,7 +110,6 @@ parser.add_argument( "--rcdm_agumentation", action="store_true", help="Use RCDM agumentation or not." ) - parser.add_argument( "--distributed_mode", action="store_true", help="Enable distributed training" ) @@ -145,30 +144,30 @@ def main(): args.n_views == 2 ), "Only two view training is supported. Please use --n-views 2." + if args.distributed_mode: + dist_utils.init_distributed_mode( + launcher=args.distributed_launcher, + backend=args.distributed_backend, + ) + device_id = torch.cuda.current_device() + else: + device_id = None + dataset = ContrastiveLearningDataset(args.data) train_dataset = dataset.get_dataset( args.dataset_name, args.n_views, args.rcdm_agumentation, + device_id ) - train_sampler = None - if args.distributed_mode: - dist_utils.init_distributed_mode( - launcher=args.distributed_launcher, - backend=args.distributed_backend, - ) - device_id = torch.cuda.current_device() - if dist_utils.is_dist_avail_and_initialized(): + if dist_utils.is_dist_avail_and_initialized() and args.distributed_mode: train_sampler = DistributedSampler( train_dataset, seed=args.seed, drop_last=True, ) - else: - device_id = None - init_fn = partial( worker_init_fn, num_workers=args.num_workers, @@ -218,4 +217,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file From 
cdec0c9ab3e7ac414c63842b59bed0a2c661b59b Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 25 Jan 2024 15:27:12 -0800 Subject: [PATCH 02/57] Add rcdm model --- .../data_aug/contrastive_learning_dataset.py | 36 ++++++++++++------- SimCLR/data_aug/rcdm_aug.py | 14 ++++---- SimCLR/data_aug/rcdm_config.py | 7 ++-- rcdm/guided_diffusion_rcdm/get_rcdm_models.py | 20 +++++------ rcdm/guided_diffusion_rcdm/get_ssl_models.py | 10 +++--- run_simCLR.py | 7 ++-- 6 files changed, 55 insertions(+), 39 deletions(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index 5e9e709..71552b0 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -6,7 +6,7 @@ from SimCLR.data_aug.rcdm_config import get_config from SimCLR.data_aug.view_generator import ContrastiveLearningViewGenerator from SimCLR.exceptions.exceptions import InvalidDatasetSelection - +import random class ContrastiveLearningDataset: def __init__(self, root_folder): @@ -21,19 +21,29 @@ def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True, device_id=N s (float, optional): Magnitude of the color distortion. Defaults to 1. rcdm_agumentation (bool, optional): Whether to use RCDM augmentation. Defaults to True. 
""" - color_jitter = transforms.ColorJitter(0.8 * s, 0.8 * s, 0.8 * s, 0.2 * s) - transform_list = [ - transforms.RandomResizedCrop(size=size), - transforms.RandomHorizontalFlip(), - transforms.RandomApply([color_jitter], p=0.8), - transforms.RandomGrayscale(p=0.2), - GaussianBlur(kernel_size=int(0.1 * size)), - transforms.ToTensor(), - ] - if rcdm_agumentation: + prob = random.uniform(0, 1) + if prob < 0.5 and rcdm_agumentation: rcdm_config = get_config() - transform_list.append(RCDMInference(rcdm_config, device_id)) - transform_list.append(transforms.Resize(size=(size, size))) + timestep_respacing = ["ddim10", "ddim25", "ddim50"] + rcdm_config.timestep_respacing = timestep_respacing[random.randrange(len(timestep_respacing))] + transform_list = [ + transforms.Resize(size=(size,size)), + transforms.ToTensor(), + RCDMInference(rcdm_config, device_id), + transforms.RandomHorizontalFlip(), + transforms.Resize(size=(size,size)), + ] + else: + color_jitter = transforms.ColorJitter(0.8 * s, 0.8 * s, 0.8 * s, 0.2 * s) + transform_list = [ + transforms.RandomResizedCrop(size=size), + transforms.RandomHorizontalFlip(), + transforms.RandomApply([color_jitter], p=0.8), + transforms.RandomGrayscale(p=0.2), + GaussianBlur(kernel_size=int(0.1 * size)), + transforms.ToTensor(), + ] + return transforms.Compose(transform_list) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index fa4566d..2d917df 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -1,5 +1,4 @@ import torch -from rcdm.guided_diffusion_rcdm import dist_util from rcdm.guided_diffusion_rcdm.get_ssl_models import get_model from rcdm.guided_diffusion_rcdm.get_rcdm_models import get_dict_rcdm_model from rcdm.guided_diffusion_rcdm.script_util import ( @@ -9,14 +8,15 @@ class RCDMInference(object): - def __init__(self, config): + def __init__(self, config, device_id): """ Initialize the RCDMInference class with necessary parameters and load models. 
""" self.config = config + self.device_id = device_id # Load SSL model - self.ssl_model = get_model(self.config.type_model, self.config.use_head).cuda().eval() + self.ssl_model = get_model(self.config.type_model, self.config.use_head, self.model_dir).cuda(self.device_id).eval() for p in self.ssl_model.parameters(): p.requires_grad = False @@ -29,11 +29,11 @@ def __init__(self, config): ) if self.config.model_path == "": - trained_model = get_dict_rcdm_model(self.config.type_model, self.config.use_head) + trained_model = get_dict_rcdm_model(self.config.type_model, self.config.use_head, self.model_dir) else: trained_model = torch.load(self.config.model_path, map_location="cpu") self.model.load_state_dict(trained_model, strict=True) - self.model.to(dist_util.dev()) + self.model.cuda(self.device_id) def __call__(self, img): """ @@ -49,7 +49,7 @@ def __call__(self, img): sample_fn = self.diffusion.p_sample_loop if not self.config.use_ddim else self.diffusion.ddim_sample_loop - img = img.unsqueeze(0).repeat(1, 1, 1, 1).cuda() + img = img.unsqueeze(0).repeat(1, 1, 1, 1).cuda(self.device_id) model_kwargs = {} with torch.no_grad(): @@ -63,4 +63,4 @@ def __call__(self, img): ) print("Sampling completed!") - return sample.squeeze(0).cpu() \ No newline at end of file + return sample.detach().squeeze(0) \ No newline at end of file diff --git a/SimCLR/data_aug/rcdm_config.py b/SimCLR/data_aug/rcdm_config.py index ad7d3b5..4af056f 100644 --- a/SimCLR/data_aug/rcdm_config.py +++ b/SimCLR/data_aug/rcdm_config.py @@ -4,14 +4,15 @@ def get_config(): config = ml_collections.ConfigDict() config.image_size = 128 # The size of the images to generate. config.class_cond = False # If true, use class conditional generation. + config.model_dir = "/ssd003/projects/aieng/genssl" # Path to the directory containing the model. config.type_model = "simclr" # Type of model to use (e.g., simclr, dino). config.use_head = False # If true, use the projector/head for SSL representation. 
config.model_path = "" # Replace with the path to your model if you have one. - config.use_ddim = False # If true, use DDIM sampler. + config.use_ddim = True # If true, use DDIM sampler. config.no_shared = True # If false, enables squeeze and excitation. config.clip_denoised = True # If true, clip denoised images. config.attention_resolutions = "32,16,8" # Resolutions to use for attention layers. - config.diffusion_steps = 100 # Number of diffusion steps. + config.diffusion_steps = 1000 # Number of diffusion steps. config.learn_sigma = True # If true, learn the noise level. config.noise_schedule = "linear" # Type of noise schedule (e.g., linear). config.num_channels = 256 # Number of channels in the model. @@ -22,5 +23,7 @@ def get_config(): config.use_scale_shift_norm = True # If true, use scale-shift normalization. config.ssl_image_size = 224 # Size of the input images for the SSL model. config.ssl_image_channels = 3 # Number of channels of the input images for the SSL model. + config.num_images = 1 # Number of images to generate. + config.timestep_respacing = "ddim10" # Type of timestep respacing (e.g., ddim25). return config \ No newline at end of file diff --git a/rcdm/guided_diffusion_rcdm/get_rcdm_models.py b/rcdm/guided_diffusion_rcdm/get_rcdm_models.py index 326ba6c..83ddcfb 100644 --- a/rcdm/guided_diffusion_rcdm/get_rcdm_models.py +++ b/rcdm/guided_diffusion_rcdm/get_rcdm_models.py @@ -4,41 +4,41 @@ import torch.nn as nn from torchvision import models as torchvision_models -def get_dict_rcdm_model(model="dino", use_head=False): +def get_dict_rcdm_model(model="dino", use_head=False, model_dir='./'): ''' Download checkpoints of RCDM. 
''' if model == "supervised": - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_supervised.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_supervised.pt", map_location="cpu", model_dir=model_dir) return trained_model elif model == "simclr": if use_head: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_simclr_head.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_simclr_head.pt", map_location="cpu", model_dir=model_dir) else: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_simclr_trunk.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_simclr_trunk.pt", map_location="cpu", model_dir=model_dir) return trained_model elif model == "barlow": if use_head: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_barlow_head.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_barlow_head.pt", map_location="cpu", model_dir=model_dir) else: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_barlow_trunk.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_barlow_trunk.pt", map_location="cpu", model_dir=model_dir) return trained_model elif model == "vicreg": if use_head: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_vicreg_head.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_vicreg_head.pt", map_location="cpu", model_dir=model_dir) else: - trained_model = 
torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_vicreg_trunk.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_vicreg_trunk.pt", map_location="cpu", model_dir=model_dir) return trained_model elif model == "dino": if use_head: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_dino_head.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_dino_head.pt", map_location="cpu", model_dir=model_dir) else: - trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_dino_trunk.pt", map_location="cpu") + trained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/rcdm/rcdm_ema_dino_trunk.pt", map_location="cpu", model_dir=model_dir) return trained_model else: diff --git a/rcdm/guided_diffusion_rcdm/get_ssl_models.py b/rcdm/guided_diffusion_rcdm/get_ssl_models.py index db5bd60..b8e3978 100644 --- a/rcdm/guided_diffusion_rcdm/get_ssl_models.py +++ b/rcdm/guided_diffusion_rcdm/get_ssl_models.py @@ -89,7 +89,7 @@ def forward(self, x): x = torch.nn.functional.normalize(x, dim=-1, p=2).detach() return x -def get_model(model="dino", use_head=False): +def get_model(model="dino", use_head=False, model_dir='./'): ''' Select a model that will be used to compute the embeddings needed by RCDM. You can use any kind of model, ConvNets/MLPs, or VITs. 
@@ -109,7 +109,7 @@ def get_model(model="dino", use_head=False): nlayers=2, use_bn=True, ) - pretrained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/dino/dino_resnet50_pretrain/dino_resnet50_pretrain_full_checkpoint.pth", map_location="cpu") + pretrained_model = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/dino/dino_resnet50_pretrain/dino_resnet50_pretrain_full_checkpoint.pth", map_location="cpu", model_dir=model_dir) pretrained_model = pretrained_model["teacher"] if "state_dict" in pretrained_model: pretrained_model = pretrained_model["state_dict"] @@ -130,7 +130,7 @@ def get_model(model="dino", use_head=False): elif model == "simclr": embedding_model = torchvision_models.resnet50() embedding_model.fc = nn.Identity() - pretrained_model_base = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/vissl/model_zoo/simclr_rn50_1000ep_simclr_8node_resnet_16_07_20.afe428c7/model_final_checkpoint_phase999.torch", map_location="cpu") + pretrained_model_base = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/vissl/model_zoo/simclr_rn50_1000ep_simclr_8node_resnet_16_07_20.afe428c7/model_final_checkpoint_phase999.torch", map_location="cpu", model_dir=model_dir) # Load trunk pretrained_model = pretrained_model_base["classy_state_dict"]["base_model"]["model"]["trunk"] pretrained_model = {k.replace("_feature_blocks.", ""): v for k, v in pretrained_model.items()} @@ -149,7 +149,7 @@ def get_model(model="dino", use_head=False): elif model == "barlow": embedding_model = torchvision_models.resnet50() embedding_model.fc = nn.Identity() - pretrained_model_base = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/vissl/model_zoo/barlow_twins/barlow_twins_32gpus_4node_imagenet1k_1000ep_resnet50.torch", map_location="cpu") + pretrained_model_base = 
torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/vissl/model_zoo/barlow_twins/barlow_twins_32gpus_4node_imagenet1k_1000ep_resnet50.torch", map_location="cpu", model_dir=model_dir) # Load trunk pretrained_model = pretrained_model_base["classy_state_dict"]["base_model"]["model"]["trunk"] pretrained_model = {k.replace("_feature_blocks.", ""): v for k, v in pretrained_model.items()} @@ -169,7 +169,7 @@ def get_model(model="dino", use_head=False): elif model == "vicreg": embedding_model = torchvision_models.resnet50() embedding_model.fc = nn.Identity() - pretrained_model_base = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/vicreg/resnet50_fullckpt.pth", map_location="cpu") + pretrained_model_base = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/vicreg/resnet50_fullckpt.pth", map_location="cpu", model_dir=model_dir) embedding_model.classifier = nn.Identity() embedding_model.projector = Projector(emb=8192) pretrained = "resnet50_fullckpt.pth" diff --git a/run_simCLR.py b/run_simCLR.py index 564416b..4ce3f74 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -140,9 +140,11 @@ def main(): args = parser.parse_args() print(args) + torch.multiprocessing.set_start_method("spawn") + assert ( args.n_views == 2 - ), "Only two view training is supported. Please use --n-views 2." + ), "Only two view training is supported. Please use --n-views 2." 
if args.distributed_mode: dist_utils.init_distributed_mode( @@ -168,6 +170,7 @@ def main(): seed=args.seed, drop_last=True, ) + init_fn = partial( worker_init_fn, num_workers=args.num_workers, @@ -181,7 +184,7 @@ def main(): sampler=train_sampler, num_workers=args.num_workers, worker_init_fn=init_fn, - pin_memory=True, + pin_memory=False, drop_last=True, ) From 23cdf804c07178845fa7183bb6a3c6cac322c985 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 25 Jan 2024 17:31:28 -0800 Subject: [PATCH 03/57] Add checkpointing --- SimCLR/simclr.py | 32 ++++++++++++++++++-------------- run_simCLR.py | 2 ++ 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index 661fdf2..8a06401 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -18,7 +18,10 @@ def __init__(self, *args, **kwargs): self.optimizer = kwargs["optimizer"] self.scheduler = kwargs["scheduler"] self.device_id = kwargs["device_id"] - self.writer = SummaryWriter() + log_dir = os.path.join(self.args.model_dir, self.args.experiment_name) + if not os.isdir(log_dir): + os.mkdir(log_dir) + self.writer = SummaryWriter(log_dir=log_dir) self.criterion = torch.nn.CrossEntropyLoss().cuda(self.device_id) def simclr_logits_and_labels(self, features): @@ -91,7 +94,7 @@ def train(self, train_loader): if n_iter % self.args.log_every_n_steps == 0: print( - f"Calculating accuracy/loss at iteration: {n_iter}, loss: {loss}", + f"Calculating accuracy/loss at iteration: {n_iter}, loss: {loss},acc: {accuracy(logits, labels, topk=(1, 5))}", ) top1, top5 = accuracy(logits, labels, topk=(1, 5)) self.writer.add_scalar("loss", loss, global_step=n_iter) @@ -102,6 +105,18 @@ def train(self, train_loader): self.scheduler.get_last_lr()[0], global_step=n_iter, ) + # save model checkpoints + checkpoint_name = "checkpoint_{:04d}.pth.tar".format(self.args.epochs) + save_checkpoint( + { + "n_iter": n_iter, + "arch": self.args.arch, + "state_dict": self.model.state_dict(), + "optimizer": 
self.optimizer.state_dict(), + }, + is_best=False, + filename=os.path.join(self.writer.log_dir, checkpoint_name), + ) n_iter += 1 @@ -112,16 +127,5 @@ def train(self, train_loader): print(f"Epoch: {epoch_counter}\tLoss: {loss}\tTop1 accuracy: {top1[0]}") print("Training has finished.") - # save model checkpoints - checkpoint_name = "checkpoint_{:04d}.pth.tar".format(self.args.epochs) - save_checkpoint( - { - "epoch": self.args.epochs, - "arch": self.args.arch, - "state_dict": self.model.state_dict(), - "optimizer": self.optimizer.state_dict(), - }, - is_best=False, - filename=os.path.join(self.writer.log_dir, checkpoint_name), - ) + print(f"Model checkpoint and metadata has been saved at {self.writer.log_dir}.") diff --git a/run_simCLR.py b/run_simCLR.py index 4ce3f74..3cb75d6 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -115,6 +115,8 @@ ) parser.add_argument("--distributed_launcher", default="slurm") parser.add_argument("--distributed_backend", default="nccl") +parser.add_argument("--model_dir", default="model_checkpoints") +parser.add_argument("--experiment_name", default="simclr") def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: From ae427371edbdc4a9e3f40aebc2587f4970f57fc5 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 25 Jan 2024 17:36:58 -0800 Subject: [PATCH 04/57] update --- SimCLR/data_aug/rcdm_aug.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index 2d917df..5cb22dd 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -16,7 +16,7 @@ def __init__(self, config, device_id): self.device_id = device_id # Load SSL model - self.ssl_model = get_model(self.config.type_model, self.config.use_head, self.model_dir).cuda(self.device_id).eval() + self.ssl_model = get_model(self.config.type_model, self.config.use_head, self.config.model_dir).cuda(self.device_id).eval() for p in self.ssl_model.parameters(): 
p.requires_grad = False @@ -29,7 +29,7 @@ def __init__(self, config, device_id): ) if self.config.model_path == "": - trained_model = get_dict_rcdm_model(self.config.type_model, self.config.use_head, self.model_dir) + trained_model = get_dict_rcdm_model(self.config.type_model, self.config.use_head, self.config.model_dir) else: trained_model = torch.load(self.config.model_path, map_location="cpu") self.model.load_state_dict(trained_model, strict=True) From 73c580b7f04635edd9b83938617d177ba42c68c6 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 25 Jan 2024 17:41:41 -0800 Subject: [PATCH 05/57] update --- SimCLR/simclr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index 8a06401..518fe8b 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -19,7 +19,7 @@ def __init__(self, *args, **kwargs): self.scheduler = kwargs["scheduler"] self.device_id = kwargs["device_id"] log_dir = os.path.join(self.args.model_dir, self.args.experiment_name) - if not os.isdir(log_dir): + if not os.path.exists(log_dir): os.mkdir(log_dir) self.writer = SummaryWriter(log_dir=log_dir) self.criterion = torch.nn.CrossEntropyLoss().cuda(self.device_id) From 25936dca4dc8db5fe0a8ef8b7ed025e51b74485e Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 25 Jan 2024 17:52:56 -0800 Subject: [PATCH 06/57] update config --- SimCLR/data_aug/contrastive_learning_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index 71552b0..9fbdc87 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -24,7 +24,7 @@ def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True, device_id=N prob = random.uniform(0, 1) if prob < 0.5 and rcdm_agumentation: rcdm_config = get_config() - timestep_respacing = ["ddim10", "ddim25", "ddim50"] + timestep_respacing = ["ddim10", 
"ddim25"] rcdm_config.timestep_respacing = timestep_respacing[random.randrange(len(timestep_respacing))] transform_list = [ transforms.Resize(size=(size,size)), From 2d9d1971382c409bcb6b49556ffc8c32aab69517 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 25 Jan 2024 17:59:41 -0800 Subject: [PATCH 07/57] edit logging --- SimCLR/simclr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index 518fe8b..5822cd8 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -93,8 +93,9 @@ def train(self, train_loader): scaler.update() if n_iter % self.args.log_every_n_steps == 0: + top1, top5 = accuracy(logits, labels, topk=(1, 5)) print( - f"Calculating accuracy/loss at iteration: {n_iter}, loss: {loss},acc: {accuracy(logits, labels, topk=(1, 5))}", + f"Calculating accuracy/loss at iteration: {n_iter}, loss: {loss},acc: top1 - {top1[0]}, top5 - {top5[0]}", ) top1, top5 = accuracy(logits, labels, topk=(1, 5)) self.writer.add_scalar("loss", loss, global_step=n_iter) From 422a10dc29719886f41807795f63b32f907b4613 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 30 Jan 2024 13:54:22 -0500 Subject: [PATCH 08/57] server --- .../data_aug/contrastive_learning_dataset.py | 4 +- SimCLR/data_aug/rcdm_aug.py | 128 +++++++++--------- SimCLR/data_aug/rcdm_config.py | 56 ++++---- 3 files changed, 92 insertions(+), 96 deletions(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index 9fbdc87..dd786ce 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -22,10 +22,8 @@ def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True, device_id=N rcdm_agumentation (bool, optional): Whether to use RCDM augmentation. Defaults to True. 
""" prob = random.uniform(0, 1) - if prob < 0.5 and rcdm_agumentation: + if prob < 1 and rcdm_agumentation: rcdm_config = get_config() - timestep_respacing = ["ddim10", "ddim25"] - rcdm_config.timestep_respacing = timestep_respacing[random.randrange(len(timestep_respacing))] transform_list = [ transforms.Resize(size=(size,size)), transforms.ToTensor(), diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index 5cb22dd..8ea9e23 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -1,66 +1,64 @@ -import torch -from rcdm.guided_diffusion_rcdm.get_ssl_models import get_model -from rcdm.guided_diffusion_rcdm.get_rcdm_models import get_dict_rcdm_model -from rcdm.guided_diffusion_rcdm.script_util import ( - model_and_diffusion_defaults, - create_model_and_diffusion -) - - -class RCDMInference(object): - def __init__(self, config, device_id): - """ - Initialize the RCDMInference class with necessary parameters and load models. - """ - self.config = config - self.device_id = device_id - - # Load SSL model - self.ssl_model = get_model(self.config.type_model, self.config.use_head, self.config.model_dir).cuda(self.device_id).eval() - for p in self.ssl_model.parameters(): - p.requires_grad = False - - # Load RCDM model - model_defaults = model_and_diffusion_defaults() - model_args = {k: getattr(self.config, k, model_defaults[k]) for k in model_defaults} - - self.model, self.diffusion = create_model_and_diffusion( - **model_args, G_shared=self.config.no_shared, feat_cond=True, ssl_dim=self.ssl_model(torch.zeros(1, config.ssl_image_channels, config.ssl_image_size, config.ssl_image_size).cuda()).size(1) - ) - - if self.config.model_path == "": - trained_model = get_dict_rcdm_model(self.config.type_model, self.config.use_head, self.config.model_dir) - else: - trained_model = torch.load(self.config.model_path, map_location="cpu") - self.model.load_state_dict(trained_model, strict=True) - self.model.cuda(self.device_id) - - def 
__call__(self, img): - """ - Run the RCDM model inference on an images. - - Args: - img (torch.Tensor): An image to apply RCDM to. - - Returns: - List[torch.Tensor]: List of generated image tensors. - """ - print("Starting RCDM model inference...") - - sample_fn = self.diffusion.p_sample_loop if not self.config.use_ddim else self.diffusion.ddim_sample_loop - - img = img.unsqueeze(0).repeat(1, 1, 1, 1).cuda(self.device_id) - model_kwargs = {} - - with torch.no_grad(): - feat = self.ssl_model(img).detach() - model_kwargs["feat"] = feat - sample = sample_fn( - self.model, - (1, 3, self.config.image_size, self.config.image_size), - clip_denoised=self.config.clip_denoised, - model_kwargs=model_kwargs, - ) - - print("Sampling completed!") +import torch +from rcdm.guided_diffusion_rcdm.get_ssl_models import get_model +from rcdm.guided_diffusion_rcdm.get_rcdm_models import get_dict_rcdm_model +from rcdm.guided_diffusion_rcdm.script_util import ( + model_and_diffusion_defaults, + create_model_and_diffusion +) + + +class RCDMInference(object): + def __init__(self, config, device_id): + """ + Initialize the RCDMInference class with necessary parameters and load models. 
+ """ + self.config = config + self.device_id = device_id + + # Load SSL model + self.ssl_model = get_model(self.config.type_model, self.config.use_head, self.config.model_dir).cuda(self.device_id).eval() + for p in self.ssl_model.parameters(): + p.requires_grad = False + + # Load RCDM model + model_defaults = model_and_diffusion_defaults() + model_args = {k: getattr(self.config, k, model_defaults[k]) for k in model_defaults} + + self.model, self.diffusion = create_model_and_diffusion( + **model_args, G_shared=self.config.no_shared, feat_cond=True, ssl_dim=self.ssl_model(torch.zeros(1, config.ssl_image_channels, config.ssl_image_size, config.ssl_image_size).cuda()).size(1) + ) + + if self.config.model_path == "": + trained_model = get_dict_rcdm_model(self.config.type_model, self.config.use_head, self.config.model_dir) + else: + trained_model = torch.load(self.config.model_path, map_location="cpu") + self.model.load_state_dict(trained_model, strict=True) + self.model.cuda(self.device_id) + + def __call__(self, img): + """ + Run the RCDM model inference on an images. + + Args: + img (torch.Tensor): An image to apply RCDM to. + + Returns: + List[torch.Tensor]: List of generated image tensors. 
+ """ + + sample_fn = self.diffusion.p_sample_loop if not self.config.use_ddim else self.diffusion.ddim_sample_loop + + img = img.unsqueeze(0).repeat(1, 1, 1, 1).cuda(self.device_id) + model_kwargs = {} + + with torch.no_grad(): + feat = self.ssl_model(img).detach() + model_kwargs["feat"] = feat + sample = sample_fn( + self.model, + (1, 3, self.config.image_size, self.config.image_size), + clip_denoised=self.config.clip_denoised, + model_kwargs=model_kwargs, + ) + return sample.detach().squeeze(0) \ No newline at end of file diff --git a/SimCLR/data_aug/rcdm_config.py b/SimCLR/data_aug/rcdm_config.py index 4af056f..0525a1a 100644 --- a/SimCLR/data_aug/rcdm_config.py +++ b/SimCLR/data_aug/rcdm_config.py @@ -1,29 +1,29 @@ -import ml_collections - -def get_config(): - config = ml_collections.ConfigDict() - config.image_size = 128 # The size of the images to generate. - config.class_cond = False # If true, use class conditional generation. - config.model_dir = "/ssd003/projects/aieng/genssl" # Path to the directory containing the model. - config.type_model = "simclr" # Type of model to use (e.g., simclr, dino). - config.use_head = False # If true, use the projector/head for SSL representation. - config.model_path = "" # Replace with the path to your model if you have one. - config.use_ddim = True # If true, use DDIM sampler. - config.no_shared = True # If false, enables squeeze and excitation. - config.clip_denoised = True # If true, clip denoised images. - config.attention_resolutions = "32,16,8" # Resolutions to use for attention layers. - config.diffusion_steps = 1000 # Number of diffusion steps. - config.learn_sigma = True # If true, learn the noise level. - config.noise_schedule = "linear" # Type of noise schedule (e.g., linear). - config.num_channels = 256 # Number of channels in the model. - config.num_heads = 4 # Number of attention heads. - config.num_res_blocks = 2 # Number of residual blocks. 
- config.resblock_updown = True # If true, use up/down sampling in resblocks. - config.use_fp16 = False # If true, use 16-bit floating point precision. - config.use_scale_shift_norm = True # If true, use scale-shift normalization. - config.ssl_image_size = 224 # Size of the input images for the SSL model. - config.ssl_image_channels = 3 # Number of channels of the input images for the SSL model. - config.num_images = 1 # Number of images to generate. - config.timestep_respacing = "ddim10" # Type of timestep respacing (e.g., ddim25). - +import ml_collections + +def get_config(): + config = ml_collections.ConfigDict() + config.image_size = 128 # The size of the images to generate. + config.class_cond = False # If true, use class conditional generation. + config.model_dir = "/ssd003/projects/aieng/genssl" # Path to the directory containing the model. + config.type_model = "simclr" # Type of model to use (e.g., simclr, dino). + config.use_head = False # If true, use the projector/head for SSL representation. + config.model_path = "" # Replace with the path to your model if you have one. + config.use_ddim = True # If true, use DDIM sampler. + config.no_shared = True # If false, enables squeeze and excitation. + config.clip_denoised = True # If true, clip denoised images. + config.attention_resolutions = "32,16,8" # Resolutions to use for attention layers. + config.diffusion_steps = 1000 # Number of diffusion steps. + config.learn_sigma = True # If true, learn the noise level. + config.noise_schedule = "linear" # Type of noise schedule (e.g., linear). + config.num_channels = 256 # Number of channels in the model. + config.num_heads = 4 # Number of attention heads. + config.num_res_blocks = 2 # Number of residual blocks. + config.resblock_updown = True # If true, use up/down sampling in resblocks. + config.use_fp16 = False # If true, use 16-bit floating point precision. + config.use_scale_shift_norm = True # If true, use scale-shift normalization. 
+ config.ssl_image_size = 224 # Size of the input images for the SSL model. + config.ssl_image_channels = 3 # Number of channels of the input images for the SSL model. + config.num_images = 1 # Number of images to generate. + config.timestep_respacing = "ddim2" # Type of timestep respacing (e.g., ddim25). + return config \ No newline at end of file From 60d3dc3666662b436c01d301e411c266e819e7a9 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 30 Jan 2024 11:02:56 -0800 Subject: [PATCH 09/57] add checkpointing --- SimCLR/data_aug/contrastive_learning_dataset.py | 2 +- SimCLR/simclr.py | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index dd786ce..c45de87 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -22,7 +22,7 @@ def get_simclr_pipeline_transform(size, s=1, rcdm_agumentation=True, device_id=N rcdm_agumentation (bool, optional): Whether to use RCDM augmentation. Defaults to True. 
""" prob = random.uniform(0, 1) - if prob < 1 and rcdm_agumentation: + if prob < 0.5 and rcdm_agumentation: rcdm_config = get_config() transform_list = [ transforms.Resize(size=(size,size)), diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index 5822cd8..129d146 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -106,8 +106,9 @@ def train(self, train_loader): self.scheduler.get_last_lr()[0], global_step=n_iter, ) - # save model checkpoints - checkpoint_name = "checkpoint_{:04d}.pth.tar".format(self.args.epochs) + + # save latest model checkpoints + checkpoint_name = "checkpoint_final.pth.tar" save_checkpoint( { "n_iter": n_iter, @@ -126,6 +127,18 @@ def train(self, train_loader): self.scheduler.step() print(f"Epoch: {epoch_counter}\tLoss: {loss}\tTop1 accuracy: {top1[0]}") + # save model checkpoints after epochs + checkpoint_name = "checkpoint_epoch_{:04d}.pth.tar".format(epoch_counter) + save_checkpoint( + { + "n_iter": n_iter, + "arch": self.args.arch, + "state_dict": self.model.state_dict(), + "optimizer": self.optimizer.state_dict(), + }, + is_best=False, + filename=os.path.join(self.writer.log_dir, checkpoint_name), + ) print("Training has finished.") From 9ae6ee87e5c309c6fa53b471b908ddc30866f74c Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 16:56:56 -0800 Subject: [PATCH 10/57] Add eval files --- SimCLR/data_aug/supervised_dataset.py | 56 ++++++ SimCLR/evaluate_simCLR.py | 242 ++++++++++++++++++++++++++ SimCLR/models/resnet_pretrained.py | 63 +++++++ 3 files changed, 361 insertions(+) create mode 100644 SimCLR/data_aug/supervised_dataset.py create mode 100644 SimCLR/evaluate_simCLR.py create mode 100644 SimCLR/models/resnet_pretrained.py diff --git a/SimCLR/data_aug/supervised_dataset.py b/SimCLR/data_aug/supervised_dataset.py new file mode 100644 index 0000000..6b1713a --- /dev/null +++ b/SimCLR/data_aug/supervised_dataset.py @@ -0,0 +1,56 @@ +from torchvision import datasets, transforms +from torchvision.transforms import transforms + 
+from SimCLR.exceptions.exceptions import InvalidDatasetSelection +import random + +class SupervisedDataset: + def __init__(self, root_folder): + self.root_folder = root_folder + + @staticmethod + def get_transform(size): + """Return a set of simple transformations for supervised learning. + + Args: + size (int): Image size. + """ + transform_list = [ + transforms.Resize(size=(size,size)), + transforms.ToTensor(), + ] + + return transforms.Compose(transform_list) + + + def get_dataset(self, name, train = True): + if name == "imagenet": + if train: + split = "train" + else: + split = "val" + return datasets.ImageNet( + self.root_folder, + split=split, + transform=self.get_transform(224), + ) + elif name == "cifar10": + return datasets.CIFAR10( + self.root_folder, + train=train, + transform= self.get_transform(32), + download=True, + ) + elif name == "stl10": + if train: + split = "train" + else: + split = "test" + return datasets.STL10( + self.root_folder, + split=split, + transform=self.get_transform(96), + download=True, + ) + else: + raise InvalidDatasetSelection() \ No newline at end of file diff --git a/SimCLR/evaluate_simCLR.py b/SimCLR/evaluate_simCLR.py new file mode 100644 index 0000000..5910c28 --- /dev/null +++ b/SimCLR/evaluate_simCLR.py @@ -0,0 +1,242 @@ +import argparse +import random +from functools import partial + +import torch +from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 +from torch.utils.data.distributed import DistributedSampler +from torchvision import models + +from SimCLR import distributed as dist_utils +from SimCLR.data_aug.supervised_dataset import SupervisedDataset +from SimCLR.models.resnet_pretrained import PretrainedResNet +from SimCLR.simclr import SimCLR + + +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + +parser = argparse.ArgumentParser(description="PyTorch SimCLR") +parser.add_argument( + "-data", + 
metavar="DIR", + default="/scratch/ssd004/datasets/imagenet256", + help="path to dataset, for imagenet: /scratch/ssd004/datasets/imagenet256 ", +) +parser.add_argument( + "-dataset-name", + default="imagenet", + help="dataset-name", + choices=["stl10", "cifar10", "imagenet"], +) +parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet18", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet18)", +) +parser.add_argument( + "-j", + "--num_workers", + default=4, + type=int, + metavar="N", + help="number of data loading workers", +) +parser.add_argument( + "--epochs", default=100, type=int, metavar="N", help="number of total epochs to run" +) +parser.add_argument( + "-b", + "--batch-size", + default=64, + type=int, + metavar="N", + help="mini-batch size (default: 256), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) + +parser.add_argument( + "--seed", default=42, type=int, help="seed for initializing training. " +) + +parser.add_argument( + "--log-every-n-steps", default=100, type=int, help="Log every n steps" +) +parser.add_argument( + "--distributed_mode", action="store_true", help="Enable distributed training" +) +parser.add_argument("--distributed_launcher", default="slurm") +parser.add_argument("--distributed_backend", default="nccl") +parser.add_argument("--model_dir", default="model_checkpoints") +parser.add_argument("--experiment_name", default="simclr") +parser.add_argument("--linear_evaluation", + action="store_true", + help="Whether or not to evaluate the linear evaluation of the model.") + +def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: + """Initialize worker processes with a random seed. + + Parameters + ---------- + worker_id : int + ID of the worker process. + num_workers : int + Total number of workers that will be initialized. 
+    rank : int
+        The rank of the current process.
+    seed : int
+        A random seed used to determine the worker seed.
+    """
+    worker_seed = num_workers * rank + worker_id + seed
+    torch.manual_seed(worker_seed)
+    random.seed(worker_seed)
+
+def accuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+        res = []
+        for k in topk:
+            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(100.0 / batch_size))
+        return res
+
+
+def main():
+    args = parser.parse_args()
+    print(args)
+
+    torch.multiprocessing.set_start_method("spawn")
+
+    assert (
+        args.n_views == 2
+    ), "Only two view training is supported. Please use --n-views 2."
+
+    if args.distributed_mode:
+        dist_utils.init_distributed_mode(
+            launcher=args.distributed_launcher,
+            backend=args.distributed_backend,
+        )
+        device_id = torch.cuda.current_device()
+    else:
+        device_id = None
+
+    dataset = SupervisedDataset(args.data)
+    train_dataset = dataset.get_dataset(
+        name = args.dataset_name,
+        train=True,
+    )
+    test_dataset = dataset.get_dataset(
+        name = args.dataset_name,
+        train=False,
+    )
+    train_sampler = None
+    test_sampler = None
+
+    if dist_utils.is_dist_avail_and_initialized() and args.distributed_mode:
+        train_sampler = DistributedSampler(
+            train_dataset,
+            seed=args.seed,
+            drop_last=True,
+        )
+        test_sampler = DistributedSampler(
+            test_dataset,
+            seed=args.seed,
+            drop_last=False,
+        )
+
+    init_fn = partial(
+        worker_init_fn,
+        num_workers=args.num_workers,
+        rank=dist_utils.get_rank(),
+        seed=args.seed,
+    )
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.batch_size,
+        shuffle=(train_sampler is None),
+        sampler=train_sampler,
+        num_workers=args.num_workers,
+        worker_init_fn=init_fn,
+ pin_memory=False, + drop_last=True, + ) + test_loader = torch.utils.data.DataLoader( + test_dataset, + batch_size=args.batch_size, + shuffle=(test_sampler is None), + sampler=test_sampler, + num_workers=args.num_workers, + worker_init_fn=init_fn, + pin_memory=False, + drop_last=False, + ) + + model = PretrainedResNet(base_model=args.arch, pretrained_dir = args.model_dir, linear_eval=args.linear_evaluation) + + if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): + # set the single device scope, otherwise DistributedDataParallel will + # use all available devices + torch.cuda.set_device(device_id) + model = model.cuda(device_id) + model = DDP(model, device_ids=[device_id]) + else: + model = model.cuda() + + optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.0008) + criterion = torch.nn.CrossEntropyLoss().cuda(device_id) + + for epoch in range(args.epochs): + if dist_utils.is_dist_avail_and_initialized(): + train_loader.sampler.set_epoch(epoch) + top1_train_accuracy = 0 + for counter, (x_batch, y_batch) in enumerate(train_loader): + x_batch = x_batch.cuda(device_id) + y_batch = y_batch.cuda(device_id) + + logits = model(x_batch) + loss = criterion(logits, y_batch) + top1 = accuracy(logits, y_batch, topk=(1,)) + top1_train_accuracy += top1[0] + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + top1_train_accuracy /= counter + 1 + top1_accuracy = 0 + top5_accuracy = 0 + for counter, (x_batch, y_batch) in enumerate(test_loader): + x_batch = x_batch.cuda(device_id) + y_batch = y_batch.cuda(device_id) + + logits = model(x_batch) + + top1, top5 = accuracy(logits, y_batch, topk=(1, 5)) + top1_accuracy += top1[0] + top5_accuracy += top5[0] + + top1_accuracy /= counter + 1 + top5_accuracy /= counter + 1 + print( + f"Epoch {epoch}\tTop1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", + ) + + + + +if __name__ == "__main__": + main() \ No 
newline at end of file diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py new file mode 100644 index 0000000..97f1f80 --- /dev/null +++ b/SimCLR/models/resnet_pretrained.py @@ -0,0 +1,63 @@ +import torch +from torch import nn +from torchvision import models + +from ..exceptions.exceptions import InvalidBackboneError + + +class PretrainedResNet(nn.Module): + def __init__(self, base_model, pretrained_dir, linear_eval=True): + super(PretrainedResNet, self).__init__() + + self.pretrained_dir = pretrained_dir + + self.resnet_dict = { + "resnet18": models.resnet18(pretrained=False, num_classes=10), + "resnet50": models.resnet50(pretrained=False, num_classes=10), + } + + self.backbone = self._get_basemodel(base_model) + + # load pretrained weights + log = self._load_pretrained() + assert log.missing_keys == ["fc.weight", "fc.bias"] + + if linear_eval: + # freeze all layers but the last fc + self._freeze_backbone() + parameters = list(filter(lambda p: p.requires_grad, self.backbone.parameters())) + assert len(parameters) == 2 # fc.weight, fc.bias + + def _load_pretrained(self): + checkpoint = torch.load(self.pretrained_dir, map_location='cpu') + state_dict = checkpoint["state_dict"] + for k in list(state_dict.keys()): + if k.startswith("backbone."): + if k.startswith("backbone") and not k.startswith("backbone.fc"): + # remove prefix + state_dict[k[len("backbone.") :]] = state_dict[k] + del state_dict[k] + log = self.backbone.load_state_dict(state_dict, strict=False) + return log + + + def _freeze_backbone(self): + # freeze all layers but the last fc + for name, param in self.backbone.named_parameters(): + if name not in ["fc.weight", "fc.bias"]: + param.requires_grad = False + return + + + def _get_basemodel(self, model_name): + try: + model = self.resnet_dict[model_name] + except KeyError: + raise InvalidBackboneError( + "Invalid backbone architecture. 
Check the config file and pass one of: resnet18 or resnet50", + ) + else: + return model + + def forward(self, x): + return self.backbone(x) From 642ad8a4aee9c77666cfd13344a4ebe05f55210e Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:04:26 -0800 Subject: [PATCH 11/57] add slrm file --- eval_simclr.slrm | 32 +++++++++++++++++++ .../evaluate_simCLR.py => evaluate_simCLR.py | 8 ++--- 2 files changed, 34 insertions(+), 6 deletions(-) create mode 100644 eval_simclr.slrm rename SimCLR/evaluate_simCLR.py => evaluate_simCLR.py (96%) diff --git a/eval_simclr.slrm b/eval_simclr.slrm new file mode 100644 index 0000000..39387be --- /dev/null +++ b/eval_simclr.slrm @@ -0,0 +1,32 @@ +#!/bin/bash + +#SBATCH --job-name=train_sunrgbd +#SBATCH --partition=t4v2 +#SBATCH --time=12:00:00 +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=2G +#SBATCH --output=slurm-%N-%j.out +#SBATCH --qos=m + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl/bin/activate + +export NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export PYTHONPATH="." +nvidia-smi + +# “srun” executes the script times +srun python evaluate_simCLR.py \ +--distributed_mode \ +--batch-size=256 \ +--pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ No newline at end of file diff --git a/SimCLR/evaluate_simCLR.py b/evaluate_simCLR.py similarity index 96% rename from SimCLR/evaluate_simCLR.py rename to evaluate_simCLR.py index 5910c28..ac265c5 100644 --- a/SimCLR/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -66,16 +66,12 @@ "--seed", default=42, type=int, help="seed for initializing training. 
" ) -parser.add_argument( - "--log-every-n-steps", default=100, type=int, help="Log every n steps" -) parser.add_argument( "--distributed_mode", action="store_true", help="Enable distributed training" ) parser.add_argument("--distributed_launcher", default="slurm") parser.add_argument("--distributed_backend", default="nccl") -parser.add_argument("--model_dir", default="model_checkpoints") -parser.add_argument("--experiment_name", default="simclr") +parser.add_argument("--pretrained_model_file", default=None, help="Path to the pretrained model file.") parser.add_argument("--linear_evaluation", action="store_true", help="Whether or not to evaluate the linear evaluation of the model.") @@ -185,7 +181,7 @@ def main(): drop_last=False, ) - model = PretrainedResNet(base_model=args.arch, pretrained_dir = args.model_dir, linear_eval=args.linear_evaluation) + model = PretrainedResNet(base_model=args.arch, pretrained_dir = args.pretrained_model_file, linear_eval=args.linear_evaluation) if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): # set the single device scope, otherwise DistributedDataParallel will From ea7cc42fb5a7cbe309f7fd26b6b205a76898817b Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:06:29 -0800 Subject: [PATCH 12/57] update eval file --- evaluate_simCLR.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index ac265c5..552ad25 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -117,10 +117,6 @@ def main(): torch.multiprocessing.set_start_method("spawn") - assert ( - args.n_views == 2 - ), "Only two view training is supported. Please use --n-views 2." 
- if args.distributed_mode: dist_utils.init_distributed_mode( launcher=args.distributed_launcher, From a9cbba19323fb4349996b79ef48bfdb2aa30a2a6 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:19:33 -0800 Subject: [PATCH 13/57] update eval --- eval_simclr.slrm | 4 +++- evaluate_simCLR.py | 44 +++++++++++++++++++++++++------------------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 39387be..299367c 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -29,4 +29,6 @@ nvidia-smi srun python evaluate_simCLR.py \ --distributed_mode \ --batch-size=256 \ ---pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ No newline at end of file +--pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ +--linear_evaluation \ +--log_every_n_steps=10 \ No newline at end of file diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 552ad25..a552849 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -75,6 +75,9 @@ parser.add_argument("--linear_evaluation", action="store_true", help="Whether or not to evaluate the linear evaluation of the model.") +parser.add_argument( + "--log-every-n-steps", default=100, type=int, help="Log every n steps" +) def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: """Initialize worker processes with a random seed. 
@@ -191,6 +194,8 @@ def main(): optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.0008) criterion = torch.nn.CrossEntropyLoss().cuda(device_id) + n_iter = 0 + for epoch in range(args.epochs): if dist_utils.is_dist_avail_and_initialized(): train_loader.sampler.set_epoch(epoch) @@ -207,25 +212,26 @@ def main(): optimizer.zero_grad() loss.backward() optimizer.step() - - top1_train_accuracy /= counter + 1 - top1_accuracy = 0 - top5_accuracy = 0 - for counter, (x_batch, y_batch) in enumerate(test_loader): - x_batch = x_batch.cuda(device_id) - y_batch = y_batch.cuda(device_id) - - logits = model(x_batch) - - top1, top5 = accuracy(logits, y_batch, topk=(1, 5)) - top1_accuracy += top1[0] - top5_accuracy += top5[0] - - top1_accuracy /= counter + 1 - top5_accuracy /= counter + 1 - print( - f"Epoch {epoch}\tTop1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", - ) + if n_iter % args.log_every_n_steps == 0: + top1_train_accuracy /= counter + 1 + top1_accuracy = 0 + top5_accuracy = 0 + for counter, (x_batch, y_batch) in enumerate(test_loader): + x_batch = x_batch.cuda(device_id) + y_batch = y_batch.cuda(device_id) + + logits = model(x_batch) + + top1, top5 = accuracy(logits, y_batch, topk=(1, 5)) + top1_accuracy += top1[0] + top5_accuracy += top5[0] + + top1_accuracy /= counter + 1 + top5_accuracy /= counter + 1 + print( + f"Epoch {epoch}\t Iter {n_iter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", + ) + n_iter += 1 From 90ea717217e19b3a3518a1936f2ba23ff4431332 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:22:11 -0800 Subject: [PATCH 14/57] check eval --- SimCLR/models/resnet_pretrained.py | 1 + eval_simclr.slrm | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 
97f1f80..82078a2 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -20,6 +20,7 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): # load pretrained weights log = self._load_pretrained() + print(log) assert log.missing_keys == ["fc.weight", "fc.bias"] if linear_eval: diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 299367c..6d52e84 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -30,5 +30,5 @@ srun python evaluate_simCLR.py \ --distributed_mode \ --batch-size=256 \ --pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ ---linear_evaluation \ ---log_every_n_steps=10 \ No newline at end of file +--log-every-n-steps=10 \ +--linear_evaluation \ No newline at end of file From 1cba4c8612cb06bc82f74184147ff2bb17a7bfb5 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:23:40 -0800 Subject: [PATCH 15/57] check --- SimCLR/models/resnet_pretrained.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 82078a2..e319f9d 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -17,6 +17,7 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): } self.backbone = self._get_basemodel(base_model) + print(self.backbone.state_dict().keys()) # load pretrained weights log = self._load_pretrained() @@ -32,6 +33,7 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): def _load_pretrained(self): checkpoint = torch.load(self.pretrained_dir, map_location='cpu') state_dict = checkpoint["state_dict"] + print(state_dict.keys()) for k in list(state_dict.keys()): if k.startswith("backbone."): if k.startswith("backbone") and not k.startswith("backbone.fc"): From 6a34a85a9b5f068ee20bb682bbbb624a333c5962 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:26:14 -0800 Subject: [PATCH 16/57] check state_dicts 
--- SimCLR/models/resnet_pretrained.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index e319f9d..6e5df35 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -16,8 +16,8 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): "resnet50": models.resnet50(pretrained=False, num_classes=10), } - self.backbone = self._get_basemodel(base_model) - print(self.backbone.state_dict().keys()) + model = self._get_basemodel(base_model) + print(model.state_dict().keys()) # load pretrained weights log = self._load_pretrained() @@ -29,6 +29,7 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): self._freeze_backbone() parameters = list(filter(lambda p: p.requires_grad, self.backbone.parameters())) assert len(parameters) == 2 # fc.weight, fc.bias + self.backbone = model def _load_pretrained(self): checkpoint = torch.load(self.pretrained_dir, map_location='cpu') From 86dcdf7c734eb22500422d222c2e47ec1e0e14b6 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:29:52 -0800 Subject: [PATCH 17/57] check --- SimCLR/models/resnet_pretrained.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 6e5df35..9cd18dc 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -16,8 +16,8 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): "resnet50": models.resnet50(pretrained=False, num_classes=10), } - model = self._get_basemodel(base_model) - print(model.state_dict().keys()) + self.backbone = self._get_basemodel(base_model) + print(self.backbone.state_dict().keys()) # load pretrained weights log = self._load_pretrained() @@ -29,17 +29,16 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): self._freeze_backbone() parameters = list(filter(lambda p: 
p.requires_grad, self.backbone.parameters())) assert len(parameters) == 2 # fc.weight, fc.bias - self.backbone = model def _load_pretrained(self): checkpoint = torch.load(self.pretrained_dir, map_location='cpu') state_dict = checkpoint["state_dict"] print(state_dict.keys()) for k in list(state_dict.keys()): - if k.startswith("backbone."): - if k.startswith("backbone") and not k.startswith("backbone.fc"): + if k.startswith("module.backbone."): + if k.startswith("module.backbone") and not k.startswith("module.backbone.fc"): # remove prefix - state_dict[k[len("backbone.") :]] = state_dict[k] + state_dict[k[len("module.backbone.") :]] = state_dict[k] del state_dict[k] log = self.backbone.load_state_dict(state_dict, strict=False) return log From 4e683a4e75ed60d726630c0e6445a37b9803804e Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:32:34 -0800 Subject: [PATCH 18/57] edit eval classes --- SimCLR/models/resnet_pretrained.py | 10 ++++------ evaluate_simCLR.py | 11 +++++++++-- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 9cd18dc..4c3ea45 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -6,22 +6,21 @@ class PretrainedResNet(nn.Module): - def __init__(self, base_model, pretrained_dir, linear_eval=True): + def __init__(self, base_model, pretrained_dir, linear_eval=True, num_classes=10): super(PretrainedResNet, self).__init__() self.pretrained_dir = pretrained_dir self.resnet_dict = { - "resnet18": models.resnet18(pretrained=False, num_classes=10), - "resnet50": models.resnet50(pretrained=False, num_classes=10), + "resnet18": models.resnet18(pretrained=False, num_classes=num_classes), + "resnet50": models.resnet50(pretrained=False, num_classes=num_classes), } self.backbone = self._get_basemodel(base_model) - print(self.backbone.state_dict().keys()) # load pretrained weights log = self._load_pretrained() - print(log) + 
assert log.missing_keys == ["fc.weight", "fc.bias"] if linear_eval: @@ -33,7 +32,6 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True): def _load_pretrained(self): checkpoint = torch.load(self.pretrained_dir, map_location='cpu') state_dict = checkpoint["state_dict"] - print(state_dict.keys()) for k in list(state_dict.keys()): if k.startswith("module.backbone."): if k.startswith("module.backbone") and not k.startswith("module.backbone.fc"): diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index a552849..76d724c 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -79,6 +79,7 @@ "--log-every-n-steps", default=100, type=int, help="Log every n steps" ) + def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: """Initialize worker processes with a random seed. @@ -179,8 +180,14 @@ def main(): pin_memory=False, drop_last=False, ) - - model = PretrainedResNet(base_model=args.arch, pretrained_dir = args.pretrained_model_file, linear_eval=args.linear_evaluation) + if args.dataset_name == "cifar10": + num_classes = 10 + elif args.dataset_name == "stl10": + num_classes = 10 + elif args.dataset_name == "imagenet": + num_classes = 1000 + + model = PretrainedResNet(base_model=args.arch, pretrained_dir = args.pretrained_model_file, linear_eval=args.linear_evaluation, num_classes=num_classes) if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): # set the single device scope, otherwise DistributedDataParallel will From ecfab47e3bdec13f6d074c09e9af8d4e26e79bac Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:38:52 -0800 Subject: [PATCH 19/57] check --- evaluate_simCLR.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 76d724c..343bfef 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -212,6 +212,9 @@ def main(): y_batch = y_batch.cuda(device_id) logits = model(x_batch) + print(y_batch) + print(logits.shape) + print(logits) loss = 
criterion(logits, y_batch) top1 = accuracy(logits, y_batch, topk=(1,)) top1_train_accuracy += top1[0] From 20795afce425e2c80a2f644b8772214c256d8b86 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:46:16 -0800 Subject: [PATCH 20/57] update slrm --- eval_simclr.slrm | 3 ++- evaluate_simCLR.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 6d52e84..cbeac19 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -8,7 +8,8 @@ #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu=2G -#SBATCH --output=slurm-%N-%j.out +#SBATCH --output=./runs/eval_slurm-%N-%j.out +#SBATCH --error=./runs/eval_slurm-%N-%j.err #SBATCH --qos=m PY_ARGS=${@:1} diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 343bfef..76d724c 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -212,9 +212,6 @@ def main(): y_batch = y_batch.cuda(device_id) logits = model(x_batch) - print(y_batch) - print(logits.shape) - print(logits) loss = criterion(logits, y_batch) top1 = accuracy(logits, y_batch, topk=(1,)) top1_train_accuracy += top1[0] From 38a86e6cd508f24f7f92453441b1ff43b924b376 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 17:55:50 -0800 Subject: [PATCH 21/57] Update eval --- evaluate_simCLR.py | 61 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 76d724c..4702802 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -2,10 +2,12 @@ import random from functools import partial +import os import torch from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 from torch.utils.data.distributed import DistributedSampler from torchvision import models +import shutil from SimCLR import distributed as dist_utils from SimCLR.data_aug.supervised_dataset import SupervisedDataset @@ -78,6 +80,8 @@ parser.add_argument( "--log-every-n-steps", default=100, 
type=int, help="Log every n steps" ) +parser.add_argument("--model_dir", default="model_checkpoints") +parser.add_argument("--experiment_name", default="simclr") def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: @@ -113,6 +117,11 @@ def accuracy(output, target, topk=(1,)): correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) res.append(correct_k.mul_(100.0 / batch_size)) return res + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, "model_best.pth.tar") def main(): @@ -203,6 +212,10 @@ def main(): n_iter = 0 + log_dir = os.path.join(args.model_dir, args.experiment_name) + if not os.path.exists(log_dir): + os.mkdir(log_dir) + for epoch in range(args.epochs): if dist_utils.is_dist_avail_and_initialized(): train_loader.sampler.set_epoch(epoch) @@ -220,26 +233,42 @@ def main(): loss.backward() optimizer.step() if n_iter % args.log_every_n_steps == 0: - top1_train_accuracy /= counter + 1 - top1_accuracy = 0 - top5_accuracy = 0 - for counter, (x_batch, y_batch) in enumerate(test_loader): - x_batch = x_batch.cuda(device_id) - y_batch = y_batch.cuda(device_id) - - logits = model(x_batch) - - top1, top5 = accuracy(logits, y_batch, topk=(1, 5)) - top1_accuracy += top1[0] - top5_accuracy += top5[0] - - top1_accuracy /= counter + 1 - top5_accuracy /= counter + 1 + temp_train_acc = top1_train_accuracy / (counter + 1) print( - f"Epoch {epoch}\t Iter {n_iter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", + f"Iter {n_iter}\t Top1 Train accuracy {temp_train_acc.item()}", ) n_iter += 1 + top1_train_accuracy /= counter + 1 + top1_accuracy = 0 + top5_accuracy = 0 + for counter, (x_batch, y_batch) in enumerate(test_loader): + x_batch = x_batch.cuda(device_id) + y_batch = y_batch.cuda(device_id) + + logits = model(x_batch) + + top1, top5 = accuracy(logits, 
y_batch, topk=(1, 5)) + top1_accuracy += top1[0] + top5_accuracy += top5[0] + + top1_accuracy /= counter + 1 + top5_accuracy /= counter + 1 + print( + f"Epoch {n_iter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", + ) + checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch) + save_checkpoint( + { + "n_iter": n_iter, + "arch": args.arch, + "state_dict": model.state_dict(), + "optimizer": optimizer.state_dict(), + }, + is_best=False, + filename=os.path.join(log_dir, checkpoint_name), + ) + From d85de1708a7c651c4dd0dcbb474b2fad6ab0cdd1 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 18:00:16 -0800 Subject: [PATCH 22/57] edit --- eval_simclr.slrm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index cbeac19..f7208b8 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -32,4 +32,6 @@ srun python evaluate_simCLR.py \ --batch-size=256 \ --pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ --log-every-n-steps=10 \ ---linear_evaluation \ No newline at end of file +--linear_evaluation \ +--model_dir="/ssd003/projects/aieng/genssl/experiments" \ +--experiment_name="simclr" \ No newline at end of file From e359965c0f91b5b649261bbeaf5aad0e9911587e Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 18:02:53 -0800 Subject: [PATCH 23/57] edit slrm --- eval_simclr.slrm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index f7208b8..e99853e 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -8,8 +8,8 @@ #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu=2G -#SBATCH --output=./runs/eval_slurm-%N-%j.out -#SBATCH --error=./runs/eval_slurm-%N-%j.err +#SBATCH --output=/ssd003/projects/aieng/genssl/experiments/simclr/eval_slurm-%N-%j.out +#SBATCH 
--error=/ssd003/projects/aieng/genssl/experiments/simclr/eval_slurm-%N-%j.err #SBATCH --qos=m PY_ARGS=${@:1} @@ -31,7 +31,7 @@ srun python evaluate_simCLR.py \ --distributed_mode \ --batch-size=256 \ --pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ ---log-every-n-steps=10 \ +--log-every-n-steps=100 \ --linear_evaluation \ --model_dir="/ssd003/projects/aieng/genssl/experiments" \ --experiment_name="simclr" \ No newline at end of file From c78abd06c780e907243dda80a7bc5164ed8521bc Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 31 Jan 2024 18:03:50 -0800 Subject: [PATCH 24/57] correct sample slrm --- eval_simclr.slrm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index e99853e..5c1010a 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -8,8 +8,8 @@ #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu=2G -#SBATCH --output=/ssd003/projects/aieng/genssl/experiments/simclr/eval_slurm-%N-%j.out -#SBATCH --error=/ssd003/projects/aieng/genssl/experiments/simclr/eval_slurm-%N-%j.err +#SBATCH --output=./runs/eval_slurm-%N-%j.out +#SBATCH --error=./runs/eval_slurm-%N-%j.err #SBATCH --qos=m PY_ARGS=${@:1} From a1fa4ea1abba151f53b0d69bff55a57a3f72625c Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 1 Feb 2024 11:13:19 -0800 Subject: [PATCH 25/57] fix multi gpu --- SimCLR/data_aug/rcdm_aug.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index 8ea9e23..43d6dda 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -16,7 +16,9 @@ def __init__(self, config, device_id): self.device_id = device_id # Load SSL model - self.ssl_model = get_model(self.config.type_model, self.config.use_head, self.config.model_dir).cuda(self.device_id).eval() + self.ssl_model = get_model(self.config.type_model, self.config.use_head, 
self.config.model_dir) + self.ssl_model = self.ssl_model.cuda(self.device_id) + self.ssl_model.eval() for p in self.ssl_model.parameters(): p.requires_grad = False @@ -33,7 +35,7 @@ def __init__(self, config, device_id): else: trained_model = torch.load(self.config.model_path, map_location="cpu") self.model.load_state_dict(trained_model, strict=True) - self.model.cuda(self.device_id) + self.model = self.model.cuda(self.device_id) def __call__(self, img): """ From f96c12175866ade196cf4256a6adc2978afb81ca Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 12 Feb 2024 12:25:26 -0800 Subject: [PATCH 26/57] Delete pytest --- .pre-commit-config.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7d459c0..45f0c42 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,15 +40,6 @@ repos: - id: nbqa-ruff args: [--fix] - - repo: local - hooks: - - id: pytest - name: pytest - entry: python3 -m pytest - language: system - pass_filenames: false - always_run: true - exclude: | (?x)( ^rcdm/| From 73eb9e44567ffe4bf3375296f68fd83b54e0f2d9 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 09:20:05 -0800 Subject: [PATCH 27/57] update eval --- .../data_aug/contrastive_learning_dataset.py | 1 - evaluate_simCLR.py | 51 ++++++++++++++----- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index 8c67ff0..ef488b1 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -7,7 +7,6 @@ from SimCLR.data_aug.rcdm_config import get_config from SimCLR.data_aug.view_generator import ContrastiveLearningViewGenerator from SimCLR.exceptions.exceptions import InvalidDatasetSelection -import random class ContrastiveLearningDataset: def __init__(self, root_folder): diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 
4702802..8a9771a 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -51,7 +51,11 @@ help="number of data loading workers", ) parser.add_argument( - "--epochs", default=100, type=int, metavar="N", help="number of total epochs to run" + "--epochs", + default=100, + type=int, + metavar="N", + help="number of total epochs to run", ) parser.add_argument( "-b", @@ -63,23 +67,33 @@ "batch size of all GPUs on the current node when " "using Data Parallel or Distributed Data Parallel", ) - parser.add_argument( - "--seed", default=42, type=int, help="seed for initializing training. " + "--seed", + default=42, + type=int, + help="seed for initializing training. ", +) +parser.add_argument( + "--log-every-n-steps", + default=100, + type=int, + help="Log every n steps", ) - parser.add_argument( - "--distributed_mode", action="store_true", help="Enable distributed training" + "--distributed_mode", + action="store_true", + help="Enable distributed training", ) parser.add_argument("--distributed_launcher", default="slurm") parser.add_argument("--distributed_backend", default="nccl") -parser.add_argument("--pretrained_model_file", default=None, help="Path to the pretrained model file.") -parser.add_argument("--linear_evaluation", - action="store_true", - help="Whether or not to evaluate the linear evaluation of the model.") parser.add_argument( - "--log-every-n-steps", default=100, type=int, help="Log every n steps" -) + "--pretrained_model_file", + default=None, + help="Path to the pretrained model file.") +parser.add_argument( + "--linear_evaluation", + action="store_true", + help="Whether or not to evaluate the linear evaluation of the model.") parser.add_argument("--model_dir", default="model_checkpoints") parser.add_argument("--experiment_name", default="simclr") @@ -196,7 +210,11 @@ def main(): elif args.dataset_name == "imagenet": num_classes = 1000 - model = PretrainedResNet(base_model=args.arch, pretrained_dir = args.pretrained_model_file, 
linear_eval=args.linear_evaluation, num_classes=num_classes) + model = PretrainedResNet( + base_model=args.arch, + pretrained_dir = args.pretrained_model_file, + linear_eval=args.linear_evaluation, + num_classes=num_classes) if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): # set the single device scope, otherwise DistributedDataParallel will @@ -207,7 +225,12 @@ def main(): else: model = model.cuda() - optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.0008) + optimizer = torch.optim.Adam( + model.parameters(), + lr=3e-4, + weight_decay=0.0008, + ) + criterion = torch.nn.CrossEntropyLoss().cuda(device_id) n_iter = 0 @@ -260,7 +283,7 @@ def main(): checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch) save_checkpoint( { - "n_iter": n_iter, + "n_epoch": epoch, "arch": args.arch, "state_dict": model.state_dict(), "optimizer": optimizer.state_dict(), From e67b59831ce8bfaacdb052b54d344b63decbfd10 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 09:30:31 -0800 Subject: [PATCH 28/57] clean code --- SimCLR/data_aug/contrastive_learning_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index ef488b1..dd89386 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -7,7 +7,6 @@ from SimCLR.data_aug.rcdm_config import get_config from SimCLR.data_aug.view_generator import ContrastiveLearningViewGenerator from SimCLR.exceptions.exceptions import InvalidDatasetSelection - class ContrastiveLearningDataset: def __init__(self, root_folder): self.root_folder = root_folder From 56688ca2d61b51669a9a05d3fd50421b4747a284 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 09:30:53 -0800 Subject: [PATCH 29/57] clean code --- SimCLR/data_aug/contrastive_learning_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index dd89386..e6fbb3c 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -7,6 +7,8 @@ from SimCLR.data_aug.rcdm_config import get_config from SimCLR.data_aug.view_generator import ContrastiveLearningViewGenerator from SimCLR.exceptions.exceptions import InvalidDatasetSelection + + class ContrastiveLearningDataset: def __init__(self, root_folder): self.root_folder = root_folder From 084d05c96173638252b4ef000544d165d04b7dc0 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 09:31:14 -0800 Subject: [PATCH 30/57] clean code --- SimCLR/data_aug/contrastive_learning_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/data_aug/contrastive_learning_dataset.py index e6fbb3c..c9bc448 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/data_aug/contrastive_learning_dataset.py @@ -113,4 +113,5 @@ def get_dataset( except KeyError: raise InvalidDatasetSelection() else: - return dataset_fn() \ No newline at end of file + return dataset_fn() + \ No newline at end of file From 1b9ccf35807ab4fcbd8e11c74230eeeff229d4c7 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 09:32:58 -0800 Subject: [PATCH 31/57] clean code --- SimCLR/simclr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index ecb2a80..e2f1eb7 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -63,7 +63,6 @@ def train(self, train_loader): self.scheduler.get_last_lr()[0], global_step=n_iter, ) - n_iter += 1 # warmup for the first 10 epochs From c707014586958f9774133905e300177e86d427db Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 09:33:48 -0800 Subject: [PATCH 32/57] clean code --- run_simCLR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/run_simCLR.py b/run_simCLR.py index 4d2615c..de526c5 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -169,7 +169,7 @@ def main(): assert ( args.n_views == 2 - ), "Only two view training is supported. Please use --n-views 2." + ), "Only two view training is supported. Please use --n-views 2." if args.distributed_mode: dist_utils.init_distributed_mode( From b36c694649aaf748cda95e5fcda19ac296ee4ac4 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 12:15:03 -0800 Subject: [PATCH 33/57] debug rcdm error --- SimCLR/data_aug/rcdm_aug.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index ccf2c9d..dd9f658 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -89,9 +89,11 @@ def __call__(self, img): if not self.config.use_ddim else self.diffusion.ddim_sample_loop ) - + print("1",img.shape) img = img.unsqueeze(0).repeat(1, 1, 1, 1) + print("2",img.shape) img = self.preprocess_input_image(img).cuda(self.device_id) + print("3",img.shape) model_kwargs = {} with torch.no_grad(): From cf9713d20a35e653469cce60507c363b9a1b1f4c Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 12:27:01 -0800 Subject: [PATCH 34/57] edit --- SimCLR/data_aug/rcdm_aug.py | 1 + 1 file changed, 1 insertion(+) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index dd9f658..bef1f84 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -89,6 +89,7 @@ def __call__(self, img): if not self.config.use_ddim else self.diffusion.ddim_sample_loop ) + print(img) print("1",img.shape) img = img.unsqueeze(0).repeat(1, 1, 1, 1) print("2",img.shape) From f8e8eb1d29cacddda1c2905b93bcab49281956d9 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 12:40:51 -0800 Subject: [PATCH 35/57] edit --- SimCLR/data_aug/rcdm_aug.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/SimCLR/data_aug/rcdm_aug.py 
b/SimCLR/data_aug/rcdm_aug.py index bef1f84..dfd396f 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -89,12 +89,10 @@ def __call__(self, img): if not self.config.use_ddim else self.diffusion.ddim_sample_loop ) - print(img) + img = self.preprocess_input_image(img).cuda(self.device_id) print("1",img.shape) img = img.unsqueeze(0).repeat(1, 1, 1, 1) print("2",img.shape) - img = self.preprocess_input_image(img).cuda(self.device_id) - print("3",img.shape) model_kwargs = {} with torch.no_grad(): From 575592344601464b211d3fa5036743723778def5 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 12:56:02 -0800 Subject: [PATCH 36/57] delete normalize --- SimCLR/data_aug/rcdm_aug.py | 1 - 1 file changed, 1 deletion(-) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index dfd396f..78b3072 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -63,7 +63,6 @@ def preprocess_input_image(self, input_image, size=224): data_utils.CenterCropLongEdge(), transforms.Resize((size, size)), transforms.ToTensor(), - transforms.Normalize(self.config.norm_mean, self.config.norm_std), ] ) tensor_image = transform_list(input_image) From 25d250f9a4c9bbfad4c39782567e34206aeb5394 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 13:06:04 -0800 Subject: [PATCH 37/57] edit --- SimCLR/data_aug/rcdm_aug.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index 78b3072..9494118 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -90,7 +90,7 @@ def __call__(self, img): ) img = self.preprocess_input_image(img).cuda(self.device_id) print("1",img.shape) - img = img.unsqueeze(0).repeat(1, 1, 1, 1) + img = img.repeat(1, 1, 1, 1) print("2",img.shape) model_kwargs = {} From 5754ae170b53518b597b6628ef7d5a425000dea4 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 13 Feb 2024 13:18:22 -0800 Subject: [PATCH 
38/57] delete print --- SimCLR/data_aug/rcdm_aug.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/data_aug/rcdm_aug.py index 9494118..4cb0ed2 100644 --- a/SimCLR/data_aug/rcdm_aug.py +++ b/SimCLR/data_aug/rcdm_aug.py @@ -89,9 +89,7 @@ def __call__(self, img): else self.diffusion.ddim_sample_loop ) img = self.preprocess_input_image(img).cuda(self.device_id) - print("1",img.shape) img = img.repeat(1, 1, 1, 1) - print("2",img.shape) model_kwargs = {} with torch.no_grad(): @@ -104,5 +102,4 @@ def __call__(self, img): model_kwargs=model_kwargs, ) - print("Sampling completed!") return sample.squeeze(0) From 0642905efc31a38d54911b7591245e18639d04cb Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 20 Feb 2024 10:52:57 -0800 Subject: [PATCH 39/57] update evaluation --- SimCLR/data_aug/supervised_dataset.py | 2 +- SimCLR/models/resnet_pretrained.py | 6 +- SimCLR/simclr.py | 3 +- eval_simclr.slrm | 8 +-- evaluate_simCLR.py | 80 +++++++++++++++------------ run_simCLR.py | 11 ++++ tests/test_evaluation.py | 34 ++++++++++++ 7 files changed, 97 insertions(+), 47 deletions(-) create mode 100644 tests/test_evaluation.py diff --git a/SimCLR/data_aug/supervised_dataset.py b/SimCLR/data_aug/supervised_dataset.py index 6b1713a..774a113 100644 --- a/SimCLR/data_aug/supervised_dataset.py +++ b/SimCLR/data_aug/supervised_dataset.py @@ -16,7 +16,7 @@ def get_transform(size): size (int): Image size. 
""" transform_list = [ - transforms.Resize(size=(size,size)), + transforms.CenterCrop(size=size), transforms.ToTensor(), ] diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 4c3ea45..840577a 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -6,10 +6,10 @@ class PretrainedResNet(nn.Module): - def __init__(self, base_model, pretrained_dir, linear_eval=True, num_classes=10): + def __init__(self, base_model, pretrained_model_file, linear_eval=True, num_classes=10): super(PretrainedResNet, self).__init__() - self.pretrained_dir = pretrained_dir + self.pretrained_model_file = pretrained_model_file self.resnet_dict = { "resnet18": models.resnet18(pretrained=False, num_classes=num_classes), @@ -30,7 +30,7 @@ def __init__(self, base_model, pretrained_dir, linear_eval=True, num_classes=10) assert len(parameters) == 2 # fc.weight, fc.bias def _load_pretrained(self): - checkpoint = torch.load(self.pretrained_dir, map_location='cpu') + checkpoint = torch.load(self.pretrained_model_file, map_location='cpu') state_dict = checkpoint["state_dict"] for k in list(state_dict.keys()): if k.startswith("module.backbone."): diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index e2f1eb7..21a8652 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -12,13 +12,12 @@ class SimCLR(object): - def __init__(self, *args, **kwargs): + def __init__(self, log_dir, *args, **kwargs): self.args = kwargs["args"] self.model = kwargs["model"] self.optimizer = kwargs["optimizer"] self.scheduler = kwargs["scheduler"] self.device_id = kwargs["device_id"] - log_dir = os.path.join(self.args.model_dir, self.args.experiment_name) self.writer = SummaryWriter(log_dir) self.criterion = loss.SimCLRContrastiveLoss(self.args.temperature).cuda( self.device_id diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 5c1010a..0b93ac3 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -30,8 +30,6 @@ nvidia-smi srun python 
evaluate_simCLR.py \ --distributed_mode \ --batch-size=256 \ ---pretrained_model_file="/ssd003/projects/aieng/genssl/experiments/simclr/checkpoint_epoch_0003.pth.tar" \ ---log-every-n-steps=100 \ ---linear_evaluation \ ---model_dir="/ssd003/projects/aieng/genssl/experiments" \ ---experiment_name="simclr" \ No newline at end of file +--pretrained_model_dir="/ssd003/projects/aieng/genssl/experiments/simclr" \ +--pretrained_model_name='/checkpoint_epoch_0003.pth.tar' \ +--linear_evaluation \ No newline at end of file diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 8a9771a..ee15949 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -12,7 +12,6 @@ from SimCLR import distributed as dist_utils from SimCLR.data_aug.supervised_dataset import SupervisedDataset from SimCLR.models.resnet_pretrained import PretrainedResNet -from SimCLR.simclr import SimCLR model_names = sorted( @@ -67,18 +66,30 @@ "batch size of all GPUs on the current node when " "using Data Parallel or Distributed Data Parallel", ) +parser.add_argument( + "--lr", + "--learning-rate", + default=0.0003, + type=float, + metavar="LR", + help="initial learning rate", + dest="lr", +) +parser.add_argument( + "--wd", + "--weight-decay", + default=8e-4, + type=float, + metavar="W", + help="weight decay (default: 1e-4)", + dest="weight_decay", +) parser.add_argument( "--seed", default=42, type=int, help="seed for initializing training. 
", ) -parser.add_argument( - "--log-every-n-steps", - default=100, - type=int, - help="Log every n steps", -) parser.add_argument( "--distributed_mode", action="store_true", @@ -87,15 +98,21 @@ parser.add_argument("--distributed_launcher", default="slurm") parser.add_argument("--distributed_backend", default="nccl") parser.add_argument( - "--pretrained_model_file", + "--pretrained_model_dir", + default=None, + help="Path to the pretrained model directory.") +parser.add_argument( + "--pretrained_model_name", default=None, - help="Path to the pretrained model file.") + help="Name of pretrained model.") parser.add_argument( "--linear_evaluation", action="store_true", help="Whether or not to evaluate the linear evaluation of the model.") -parser.add_argument("--model_dir", default="model_checkpoints") -parser.add_argument("--experiment_name", default="simclr") +parser.add_argument( + "--enable_checkpointing", + action="store_true", + help="Whether or not to enable checkpointing of the model.") def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: @@ -132,10 +149,8 @@ def accuracy(output, target, topk=(1,)): res.append(correct_k.mul_(100.0 / batch_size)) return res -def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): +def save_checkpoint(state, filename="checkpoint.pth.tar"): torch.save(state, filename) - if is_best: - shutil.copyfile(filename, "model_best.pth.tar") def main(): @@ -212,7 +227,7 @@ def main(): model = PretrainedResNet( base_model=args.arch, - pretrained_dir = args.pretrained_model_file, + pretrained_model_file = os.path.join(args.pretrained_model_dir, args.pretrained_model_name), linear_eval=args.linear_evaluation, num_classes=num_classes) @@ -227,17 +242,15 @@ def main(): optimizer = torch.optim.Adam( model.parameters(), - lr=3e-4, - weight_decay=0.0008, + lr=args.lr, + weight_decay=args.weight_decay, ) criterion = torch.nn.CrossEntropyLoss().cuda(device_id) n_iter = 0 - log_dir = 
os.path.join(args.model_dir, args.experiment_name) - if not os.path.exists(log_dir): - os.mkdir(log_dir) + log_dir = args.pretrained_model_dir for epoch in range(args.epochs): if dist_utils.is_dist_avail_and_initialized(): @@ -255,11 +268,6 @@ def main(): optimizer.zero_grad() loss.backward() optimizer.step() - if n_iter % args.log_every_n_steps == 0: - temp_train_acc = top1_train_accuracy / (counter + 1) - print( - f"Iter {n_iter}\t Top1 Train accuracy {temp_train_acc.item()}", - ) n_iter += 1 top1_train_accuracy /= counter + 1 @@ -280,17 +288,17 @@ def main(): print( f"Epoch {n_iter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", ) - checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch) - save_checkpoint( - { - "n_epoch": epoch, - "arch": args.arch, - "state_dict": model.state_dict(), - "optimizer": optimizer.state_dict(), - }, - is_best=False, - filename=os.path.join(log_dir, checkpoint_name), - ) + if args.enable_checkpointing: + checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch) + save_checkpoint( + { + "n_epoch": epoch, + "arch": args.arch, + "state_dict": model.state_dict(), + "optimizer": optimizer.state_dict(), + }, + filename=os.path.join(log_dir, checkpoint_name), + ) diff --git a/run_simCLR.py b/run_simCLR.py index 22b8601..4f52375 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -3,6 +3,8 @@ import random from functools import partial +import os +from datetime import datetime import torch from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 from torch.utils.data.distributed import DistributedSampler @@ -173,6 +175,14 @@ def main(): args = parser.parse_args() print(args) + # Create a directory to save the model checkpoints and logs + now = datetime.now() + dt_string = now.strftime("%d/%m/%Y_%H:%M") + log_dir = os.path.join(args.model_dir, args.experiment_name,dt_string) + if not 
os.path.exists(log_dir): + os.makedirs(log_dir) + + # Set the start method to spawn for distributed training torch.multiprocessing.set_start_method("spawn") assert ( @@ -252,6 +262,7 @@ def main(): ) simclr = SimCLR( + log_dir=log_dir, model=model, optimizer=optimizer, scheduler=scheduler, diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py new file mode 100644 index 0000000..b0cfb31 --- /dev/null +++ b/tests/test_evaluation.py @@ -0,0 +1,34 @@ +import pytest +import torch +from evaluate_simCLR import accuracy + +def test_accuracy()-> None: + # Create sample data + output = torch.tensor([[0.1, 0.5, 0.3], [0.2, 0.6, 0.2]]) + target = torch.tensor([1, 2]) + topk = (1,) + + # Calculate accuracy + res = accuracy(output, target, topk=topk) + + # Check if the result matches the expected accuracy + expected_accuracy = [50.0] + assert res == expected_accuracy + +def test_accuracy_topk_5(): + # Create sample data + output = torch.tensor([[0.1, 0.5, 0.3, 0.1, 0.4, 0.5, 0.2, 0.3, 0.1, 0.9], + [0.2, 0.6, 0.2, 0.1, 0.3, 0.6, 0.2, 0.4, 0.1, 0.8], + [0.3, 0.4, 0.3, 0.2, 0.5, 0.3, 0.1, 0.7, 0.2, 0.6], + [0.4, 0.3, 0.3, 0.5, 0.6, 0.1, 0.2, 0.8, 0.1, 0.7]]) + target = torch.tensor([6, 7, 8, 9]) # Targets that are not in the top 5 + topk = (5,) + + # Calculate accuracy + res = accuracy(output, target, topk=topk) + print(res) + + # Check if the result matches the expected accuracy + # In this case, the expected accuracy is 25.0 for all samples + expected_accuracy = [50.0] + assert res == expected_accuracy \ No newline at end of file From 245eb54e6fcb587e69110853d1a5a659508a46d8 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 20 Feb 2024 10:54:29 -0800 Subject: [PATCH 40/57] update formating --- eval_simclr.slrm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 0b93ac3..46ba26b 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -31,5 +31,5 @@ srun python evaluate_simCLR.py \ --distributed_mode \ 
--batch-size=256 \ --pretrained_model_dir="/ssd003/projects/aieng/genssl/experiments/simclr" \ ---pretrained_model_name='/checkpoint_epoch_0003.pth.tar' \ +--pretrained_model_name="/checkpoint_epoch_0003.pth.tar" \ --linear_evaluation \ No newline at end of file From 33c0c9386b30936567d2d2b1d737113592bedbe0 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 09:41:24 -0800 Subject: [PATCH 41/57] update logging --- run_simCLR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_simCLR.py b/run_simCLR.py index 4f52375..b4f51d2 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -177,7 +177,7 @@ def main(): # Create a directory to save the model checkpoints and logs now = datetime.now() - dt_string = now.strftime("%d/%m/%Y_%H:%M") + dt_string = now.strftime("%d_%m_%Y_%H:%M") log_dir = os.path.join(args.model_dir, args.experiment_name,dt_string) if not os.path.exists(log_dir): os.makedirs(log_dir) From 31409dd0aa851c2680cbeeec019d2864fb340aae Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 10:24:54 -0800 Subject: [PATCH 42/57] Update bash files --- eval_simclr.slrm | 25 +++++++++++++++++-------- evaluate_simCLR.py | 6 +++++- run_simCLR.py | 2 +- train_simclr.slrm | 15 +++++++++------ 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 46ba26b..bb70619 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -15,7 +15,7 @@ PY_ARGS=${@:1} # load virtual environment -source /ssd003/projects/aieng/envs/genssl/bin/activate +source /ssd003/projects/aieng/envs/genssl2/bin/activate export NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend export CUDA_LAUNCH_BLOCKING=1 @@ -26,10 +26,19 @@ export MASTER_PORT=45679 export PYTHONPATH="." 
nvidia-smi -# “srun” executes the script times -srun python evaluate_simCLR.py \ ---distributed_mode \ ---batch-size=256 \ ---pretrained_model_dir="/ssd003/projects/aieng/genssl/experiments/simclr" \ ---pretrained_model_name="/checkpoint_epoch_0003.pth.tar" \ ---linear_evaluation \ No newline at end of file +files=$(ls checkpoint_epoch_*) + +# Loop through each file and pass it as a parameter to the rest of the script +for file in $files +do + # “srun” executes the script times + srun python evaluate_simCLR.py \ + --distributed_mode \ + --batch-size=256 \ + --pretrained_model_dir="/projects/imagenet_synthetic/train_models" \ + --experiment_name="simclr/23_02_2024_13:02" \ + --pretrained_model_name=$file \ + --linear_evaluation \ + --arch="resnet50" + # Add your processing logic here +done \ No newline at end of file diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index ee15949..a87bb83 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -105,6 +105,10 @@ "--pretrained_model_name", default=None, help="Name of pretrained model.") +parser.add_argument( + "--experiment_name", + default=None, + help="Name of the experiment.") parser.add_argument( "--linear_evaluation", action="store_true", @@ -227,7 +231,7 @@ def main(): model = PretrainedResNet( base_model=args.arch, - pretrained_model_file = os.path.join(args.pretrained_model_dir, args.pretrained_model_name), + pretrained_model_file = os.path.join(args.pretrained_model_dir, args.experiment_name, args.pretrained_model_name), linear_eval=args.linear_evaluation, num_classes=num_classes) diff --git a/run_simCLR.py b/run_simCLR.py index b4f51d2..02c624a 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -260,7 +260,7 @@ def main(): scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=len(train_loader), eta_min=0, last_epoch=-1 ) - + print(device_id,flush=True) simclr = SimCLR( log_dir=log_dir, model=model, diff --git a/train_simclr.slrm b/train_simclr.slrm index 477a1fb..cf0cf25 100644 --- 
a/train_simclr.slrm +++ b/train_simclr.slrm @@ -1,14 +1,15 @@ #!/bin/bash #SBATCH --job-name=train_sunrgbd -#SBATCH --partition=t4v2 -#SBATCH --time=12:00:00 +#SBATCH --partition=a100 #SBATCH --nodes=1 #SBATCH --gres=gpu:4 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 -#SBATCH --mem-per-cpu=2G -#SBATCH --output=slurm-%N-%j.out +#SBATCH --mem-per-cpu=8G +#SBATCH --output=/h/sayromlou/GenerativeSSL/logs/simclr/slurm-%N-%j.out +#SBATCH --error=/h/sayromlou/GenerativeSSL/logs/simclr/slurm-%N-%j.err +#SBATCH --qos=a100_arashaf PY_ARGS=${@:1} @@ -28,5 +29,7 @@ nvidia-smi srun python run_simCLR.py \ --fp16-precision \ --distributed_mode \ ---batch-size=4 \ ---icgan_augmentation +--batch-size=256 \ +--model_dir="/projects/imagenet_synthetic/train_models" \ +--experiment_name="simclr" \ +--arch="resnet50" From eb9a4b6633a56b823263aacc0040ea2a1e121df4 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 11:44:54 -0800 Subject: [PATCH 43/57] Update augmentation and saving file --- .../contrastive_learning_dataset.py | 12 ++--- SimCLR/datasets/data_aug/center_crop.py | 44 +++++++++++++++++++ .../{ => datasets}/data_aug/gaussian_blur.py | 0 SimCLR/{ => datasets}/data_aug/icgan_aug.py | 0 .../{ => datasets}/data_aug/icgan_config.py | 0 SimCLR/{ => datasets}/data_aug/rcdm_aug.py | 0 SimCLR/{ => datasets}/data_aug/rcdm_config.py | 0 .../supervised_dataset.py | 5 ++- .../{data_aug => datasets}/view_generator.py | 0 eval_simclr.slrm | 17 +++---- evaluate_simCLR.py | 3 +- run_simCLR.py | 5 +-- train_simclr.slrm | 17 +++---- 13 files changed, 75 insertions(+), 28 deletions(-) rename SimCLR/{data_aug => datasets}/contrastive_learning_dataset.py (91%) create mode 100644 SimCLR/datasets/data_aug/center_crop.py rename SimCLR/{ => datasets}/data_aug/gaussian_blur.py (100%) rename SimCLR/{ => datasets}/data_aug/icgan_aug.py (100%) rename SimCLR/{ => datasets}/data_aug/icgan_config.py (100%) rename SimCLR/{ => datasets}/data_aug/rcdm_aug.py (100%) rename SimCLR/{ => 
datasets}/data_aug/rcdm_config.py (100%) rename SimCLR/{data_aug => datasets}/supervised_dataset.py (91%) rename SimCLR/{data_aug => datasets}/view_generator.py (100%) diff --git a/SimCLR/data_aug/contrastive_learning_dataset.py b/SimCLR/datasets/contrastive_learning_dataset.py similarity index 91% rename from SimCLR/data_aug/contrastive_learning_dataset.py rename to SimCLR/datasets/contrastive_learning_dataset.py index c9bc448..3723652 100644 --- a/SimCLR/data_aug/contrastive_learning_dataset.py +++ b/SimCLR/datasets/contrastive_learning_dataset.py @@ -1,11 +1,11 @@ from torchvision import datasets, transforms -from SimCLR.data_aug.gaussian_blur import GaussianBlur -from SimCLR.data_aug.icgan_aug import ICGANInference -from SimCLR.data_aug.icgan_config import get_icgan_config -from SimCLR.data_aug.rcdm_aug import RCDMInference -from SimCLR.data_aug.rcdm_config import get_config -from SimCLR.data_aug.view_generator import ContrastiveLearningViewGenerator +from SimCLR.datasets.data_aug.gaussian_blur import GaussianBlur +from SimCLR.datasets.data_aug.icgan_aug import ICGANInference +from SimCLR.datasets.data_aug.icgan_config import get_icgan_config +from SimCLR.datasets.data_aug.rcdm_aug import RCDMInference +from SimCLR.datasets.data_aug.rcdm_config import get_config +from SimCLR.datasets.view_generator import ContrastiveLearningViewGenerator from SimCLR.exceptions.exceptions import InvalidDatasetSelection diff --git a/SimCLR/datasets/data_aug/center_crop.py b/SimCLR/datasets/data_aug/center_crop.py new file mode 100644 index 0000000..783749c --- /dev/null +++ b/SimCLR/datasets/data_aug/center_crop.py @@ -0,0 +1,44 @@ +import torch.nn.functional as F +import torchvision +import torch + +class CostumeCenterCrop(torch.nn.Module): + def __init__(self, size=None, ratio="1:1"): + super().__init__() + self.size = size + self.ratio = ratio + def forward(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be cropped. 
+ + Returns: + PIL Image or Tensor: Cropped image. + """ + if self.size is None: + if isinstance(img, torch.Tensor): + h, w = img.shape[-2:] + else: + w, h = img.size + ratio = self.ratio.split(":") + ratio = float(ratio[0]) / float(ratio[1]) + # Size must match the ratio while cropping to the edge of the image + ratioed_w = int(h * ratio) + ratioed_h = int(w / ratio) + if w>=h: + if ratioed_h <= h: + size = (ratioed_h, w) + else: + size = (h, ratioed_w) + else: + if ratioed_w <= w: + size = (h, ratioed_w) + else: + size = (ratioed_h, w) + else: + size = self.size + return torchvision.transforms.functional.center_crop(img, size) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(size={self.size})" + \ No newline at end of file diff --git a/SimCLR/data_aug/gaussian_blur.py b/SimCLR/datasets/data_aug/gaussian_blur.py similarity index 100% rename from SimCLR/data_aug/gaussian_blur.py rename to SimCLR/datasets/data_aug/gaussian_blur.py diff --git a/SimCLR/data_aug/icgan_aug.py b/SimCLR/datasets/data_aug/icgan_aug.py similarity index 100% rename from SimCLR/data_aug/icgan_aug.py rename to SimCLR/datasets/data_aug/icgan_aug.py diff --git a/SimCLR/data_aug/icgan_config.py b/SimCLR/datasets/data_aug/icgan_config.py similarity index 100% rename from SimCLR/data_aug/icgan_config.py rename to SimCLR/datasets/data_aug/icgan_config.py diff --git a/SimCLR/data_aug/rcdm_aug.py b/SimCLR/datasets/data_aug/rcdm_aug.py similarity index 100% rename from SimCLR/data_aug/rcdm_aug.py rename to SimCLR/datasets/data_aug/rcdm_aug.py diff --git a/SimCLR/data_aug/rcdm_config.py b/SimCLR/datasets/data_aug/rcdm_config.py similarity index 100% rename from SimCLR/data_aug/rcdm_config.py rename to SimCLR/datasets/data_aug/rcdm_config.py diff --git a/SimCLR/data_aug/supervised_dataset.py b/SimCLR/datasets/supervised_dataset.py similarity index 91% rename from SimCLR/data_aug/supervised_dataset.py rename to SimCLR/datasets/supervised_dataset.py index 774a113..51efb69 100644 --- 
a/SimCLR/data_aug/supervised_dataset.py +++ b/SimCLR/datasets/supervised_dataset.py @@ -2,7 +2,7 @@ from torchvision.transforms import transforms from SimCLR.exceptions.exceptions import InvalidDatasetSelection -import random +from SimCLR.datasets.data_aug.center_crop import CostumeCenterCrop class SupervisedDataset: def __init__(self, root_folder): @@ -16,7 +16,8 @@ def get_transform(size): size (int): Image size. """ transform_list = [ - transforms.CenterCrop(size=size), + CostumeCenterCrop(), + transforms.Resize((size, size)), transforms.ToTensor(), ] diff --git a/SimCLR/data_aug/view_generator.py b/SimCLR/datasets/view_generator.py similarity index 100% rename from SimCLR/data_aug/view_generator.py rename to SimCLR/datasets/view_generator.py diff --git a/eval_simclr.slrm b/eval_simclr.slrm index bb70619..0d2164c 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -5,11 +5,11 @@ #SBATCH --time=12:00:00 #SBATCH --nodes=1 #SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=4 -#SBATCH --mem-per-cpu=2G -#SBATCH --output=./runs/eval_slurm-%N-%j.out -#SBATCH --error=./runs/eval_slurm-%N-%j.err +#SBATCH --mem=100G +#SBATCH --output=./logs/simclr/eval_slurm-%N-%j.out +#SBATCH --error=./logs/simclr/eval_slurm-%N-%j.err #SBATCH --qos=m PY_ARGS=${@:1} @@ -17,7 +17,7 @@ PY_ARGS=${@:1} # load virtual environment source /ssd003/projects/aieng/envs/genssl2/bin/activate -export NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend export CUDA_LAUNCH_BLOCKING=1 export MASTER_ADDR=$(hostname) @@ -31,12 +31,13 @@ files=$(ls checkpoint_epoch_*) # Loop through each file and pass it as a parameter to the rest of the script for file in $files do - # “srun” executes the script times - srun python evaluate_simCLR.py \ + # torchrun execute nproc-per-node * nodes times + torchrun --nnodes 1 --nproc-per-node 4 evaluate_simCLR.py \ --distributed_mode \ + 
--distributed_launcher="pytorch" \ --batch-size=256 \ --pretrained_model_dir="/projects/imagenet_synthetic/train_models" \ - --experiment_name="simclr/23_02_2024_13:02" \ + --experiment_name="simclr/2024_02_23_13_02" \ --pretrained_model_name=$file \ --linear_evaluation \ --arch="resnet50" diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index a87bb83..36b5d01 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -10,7 +10,7 @@ import shutil from SimCLR import distributed as dist_utils -from SimCLR.data_aug.supervised_dataset import SupervisedDataset +from SimCLR.datasets.supervised_dataset import SupervisedDataset from SimCLR.models.resnet_pretrained import PretrainedResNet @@ -291,6 +291,7 @@ def main(): top5_accuracy /= counter + 1 print( f"Epoch {n_iter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", + flush=True, ) if args.enable_checkpointing: checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch) diff --git a/run_simCLR.py b/run_simCLR.py index 02c624a..3d74b29 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -11,7 +11,7 @@ from torchvision import models from SimCLR import distributed as dist_utils -from SimCLR.data_aug.contrastive_learning_dataset import ContrastiveLearningDataset +from SimCLR.datasets.contrastive_learning_dataset import ContrastiveLearningDataset from SimCLR.models.resnet_simclr import ResNetSimCLR from SimCLR.simclr import SimCLR from torch.utils.data import Subset @@ -177,7 +177,7 @@ def main(): # Create a directory to save the model checkpoints and logs now = datetime.now() - dt_string = now.strftime("%d_%m_%Y_%H:%M") + dt_string = now.strftime("%Y_%m_%d_%H_%M") log_dir = os.path.join(args.model_dir, args.experiment_name,dt_string) if not os.path.exists(log_dir): os.makedirs(log_dir) @@ -260,7 +260,6 @@ def main(): scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=len(train_loader), eta_min=0, 
last_epoch=-1 ) - print(device_id,flush=True) simclr = SimCLR( log_dir=log_dir, model=model, diff --git a/train_simclr.slrm b/train_simclr.slrm index cf0cf25..01f500e 100644 --- a/train_simclr.slrm +++ b/train_simclr.slrm @@ -4,11 +4,11 @@ #SBATCH --partition=a100 #SBATCH --nodes=1 #SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=4 -#SBATCH --mem-per-cpu=8G -#SBATCH --output=/h/sayromlou/GenerativeSSL/logs/simclr/slurm-%N-%j.out -#SBATCH --error=/h/sayromlou/GenerativeSSL/logs/simclr/slurm-%N-%j.err +#SBATCH --mem=100G +#SBATCH --output=./logs/simclr/slurm-%N-%j.out +#SBATCH --error=./logs/simclr/slurm-%N-%j.err #SBATCH --qos=a100_arashaf PY_ARGS=${@:1} @@ -16,7 +16,7 @@ PY_ARGS=${@:1} # load virtual environment source /ssd003/projects/aieng/envs/genssl2/bin/activate -export NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend export CUDA_LAUNCH_BLOCKING=1 export MASTER_ADDR=$(hostname) @@ -25,11 +25,12 @@ export MASTER_PORT=45679 export PYTHONPATH="." 
nvidia-smi -# “srun” executes the script times -srun python run_simCLR.py \ +# torchrun execute nproc-per-node * nodes times +torchrun --nnodes 1 --nproc-per-node 4 run_simCLR.py \ --fp16-precision \ --distributed_mode \ +--distributed_launcher="pytorch" \ --batch-size=256 \ --model_dir="/projects/imagenet_synthetic/train_models" \ --experiment_name="simclr" \ ---arch="resnet50" +--arch="resnet50" \ No newline at end of file From 288f749404861c71cce4e5ca1cf53e631f3b2cb6 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:03:27 -0800 Subject: [PATCH 44/57] update evaluation --- eval_simclr.slrm | 11 ++++++----- evaluate_simCLR.py | 10 +++++----- train_simclr.slrm | 4 ++-- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 0d2164c..2e8bb00 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -1,16 +1,15 @@ #!/bin/bash #SBATCH --job-name=train_sunrgbd -#SBATCH --partition=t4v2 -#SBATCH --time=12:00:00 +#SBATCH --partition=a100 #SBATCH --nodes=1 #SBATCH --gres=gpu:4 #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=4 #SBATCH --mem=100G -#SBATCH --output=./logs/simclr/eval_slurm-%N-%j.out -#SBATCH --error=./logs/simclr/eval_slurm-%N-%j.err -#SBATCH --qos=m +#SBATCH --output=logs/simclr/eval_slurm-%N-%j.out +#SBATCH --error=logs/simclr/eval_slurm-%N-%j.err +#SBATCH --qos=a100_arashaf PY_ARGS=${@:1} @@ -31,6 +30,8 @@ files=$(ls checkpoint_epoch_*) # Loop through each file and pass it as a parameter to the rest of the script for file in $files do + echo "Evaluating: $file" + # torchrun execute nproc-per-node * nodes times torchrun --nnodes 1 --nproc-per-node 4 evaluate_simCLR.py \ --distributed_mode \ diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 36b5d01..42e8a19 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -7,7 +7,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 from torch.utils.data.distributed import DistributedSampler from torchvision 
import models -import shutil +from tqdm import tqdm from SimCLR import distributed as dist_utils from SimCLR.datasets.supervised_dataset import SupervisedDataset @@ -256,9 +256,9 @@ def main(): log_dir = args.pretrained_model_dir - for epoch in range(args.epochs): + for epoch_counter in tqdm(range(args.epochs), desc="Training Progress"): if dist_utils.is_dist_avail_and_initialized(): - train_loader.sampler.set_epoch(epoch) + train_loader.sampler.set_epoch(epoch_counter) top1_train_accuracy = 0 for counter, (x_batch, y_batch) in enumerate(train_loader): x_batch = x_batch.cuda(device_id) @@ -294,10 +294,10 @@ def main(): flush=True, ) if args.enable_checkpointing: - checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch) + checkpoint_name = "checkpoint_supervised_epoch_{:04d}.pth.tar".format(epoch_counter) save_checkpoint( { - "n_epoch": epoch, + "n_epoch": epoch_counter, "arch": args.arch, "state_dict": model.state_dict(), "optimizer": optimizer.state_dict(), diff --git a/train_simclr.slrm b/train_simclr.slrm index 01f500e..b5d0acb 100644 --- a/train_simclr.slrm +++ b/train_simclr.slrm @@ -7,8 +7,8 @@ #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=4 #SBATCH --mem=100G -#SBATCH --output=./logs/simclr/slurm-%N-%j.out -#SBATCH --error=./logs/simclr/slurm-%N-%j.err +#SBATCH --output=logs/simclr/slurm-%N-%j.out +#SBATCH --error=logs/simclr/slurm-%N-%j.err #SBATCH --qos=a100_arashaf PY_ARGS=${@:1} From 3b43d4e277fb59f360b93ffe9ada9c92ea1be017 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:12:43 -0800 Subject: [PATCH 45/57] Update bash file --- eval_simclr.slrm | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 2e8bb00..eeb9d63 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -25,7 +25,10 @@ export MASTER_PORT=45679 export PYTHONPATH="." 
nvidia-smi -files=$(ls checkpoint_epoch_*) +pretrained_model_dir="/projects/imagenet_synthetic/train_models" +experiment_name="simclr/2024_02_23_13_02" + +files=$(ls $pretrained_model_dir/$experiment_name/checkpoint_epoch_*) # Loop through each file and pass it as a parameter to the rest of the script for file in $files @@ -37,8 +40,8 @@ do --distributed_mode \ --distributed_launcher="pytorch" \ --batch-size=256 \ - --pretrained_model_dir="/projects/imagenet_synthetic/train_models" \ - --experiment_name="simclr/2024_02_23_13_02" \ + --pretrained_model_dir=$pretrained_model_dir \ + --experiment_name=$experiment_name \ --pretrained_model_name=$file \ --linear_evaluation \ --arch="resnet50" From e64fe5c5bcd01354d64167e4357b2a2b1b311064 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:22:49 -0800 Subject: [PATCH 46/57] edit eval --- eval_simclr.slrm | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index eeb9d63..e2e1d7a 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -28,7 +28,11 @@ nvidia-smi pretrained_model_dir="/projects/imagenet_synthetic/train_models" experiment_name="simclr/2024_02_23_13_02" -files=$(ls $pretrained_model_dir/$experiment_name/checkpoint_epoch_*) +cd $pretrained_model_dir/$experiment_name + +files=$(ls checkpoint_epoch_*) + +cd "$OLDPWD" # Loop through each file and pass it as a parameter to the rest of the script for file in $files From ef6a2147bd0d29d4cedd04de0b2058872be4107d Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:27:59 -0800 Subject: [PATCH 47/57] check loading --- SimCLR/models/resnet_pretrained.py | 1 + 1 file changed, 1 insertion(+) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 840577a..ef088a0 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -10,6 +10,7 @@ def __init__(self, base_model, pretrained_model_file, linear_eval=True, num_clas 
super(PretrainedResNet, self).__init__() self.pretrained_model_file = pretrained_model_file + print(self.pretrained_model_file, flush=True) self.resnet_dict = { "resnet18": models.resnet18(pretrained=False, num_classes=num_classes), From cef45728bae2df346aa63dbb31deb0d7321db260 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:41:08 -0800 Subject: [PATCH 48/57] debug eval --- evaluate_simCLR.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 42e8a19..95029b8 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -227,6 +227,7 @@ def main(): elif args.dataset_name == "stl10": num_classes = 10 elif args.dataset_name == "imagenet": + print("Using ImageNet dataset", flush=True) num_classes = 1000 model = PretrainedResNet( @@ -234,6 +235,8 @@ def main(): pretrained_model_file = os.path.join(args.pretrained_model_dir, args.experiment_name, args.pretrained_model_name), linear_eval=args.linear_evaluation, num_classes=num_classes) + + print("loaded model", flush=True) if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): # set the single device scope, otherwise DistributedDataParallel will @@ -254,7 +257,9 @@ def main(): n_iter = 0 - log_dir = args.pretrained_model_dir + log_dir = os.path.joinq(args.pretrained_model_dir, args.experiment_name) + + print(f"log_dir:{log_dir}", flush=True) for epoch_counter in tqdm(range(args.epochs), desc="Training Progress"): if dist_utils.is_dist_avail_and_initialized(): From 883d9c0465d3c289ab122e43722b063b14250f92 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:45:08 -0800 Subject: [PATCH 49/57] update --- evaluate_simCLR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 95029b8..0291e22 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -257,7 +257,7 @@ def main(): n_iter = 0 - log_dir = os.path.joinq(args.pretrained_model_dir, 
args.experiment_name) + log_dir = os.path.join(args.pretrained_model_dir, args.experiment_name) print(f"log_dir:{log_dir}", flush=True) From c900b9fab5ab522f92e93b7d4c28beb0dc96af45 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:51:32 -0800 Subject: [PATCH 50/57] check evaluation --- evaluate_simCLR.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 0291e22..27f2416 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -261,11 +261,12 @@ def main(): print(f"log_dir:{log_dir}", flush=True) - for epoch_counter in tqdm(range(args.epochs), desc="Training Progress"): + for epoch_counter in tqdm(range(args.epochs), desc="Epoch Progress"): if dist_utils.is_dist_avail_and_initialized(): train_loader.sampler.set_epoch(epoch_counter) top1_train_accuracy = 0 - for counter, (x_batch, y_batch) in enumerate(train_loader): + print(f"epoch:{epoch_counter}", flush=True) + for counter, (x_batch, y_batch) in tqdm(enumerate(train_loader), desc="Training Progress"): x_batch = x_batch.cuda(device_id) y_batch = y_batch.cuda(device_id) @@ -278,11 +279,13 @@ def main(): loss.backward() optimizer.step() n_iter += 1 + if counter % 100 == 0: + print(f"Epoch {epoch_counter}\t Iteration {counter}\t Loss: {loss.item()}", flush=True) top1_train_accuracy /= counter + 1 top1_accuracy = 0 top5_accuracy = 0 - for counter, (x_batch, y_batch) in enumerate(test_loader): + for counter, (x_batch, y_batch) in tqdm(enumerate(test_loader), desc="Evaluation Progress"): x_batch = x_batch.cuda(device_id) y_batch = y_batch.cuda(device_id) From 314633cf1d3186c0fde8bb47be9e63e27262cfb9 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:58:13 -0800 Subject: [PATCH 51/57] Clean the code --- SimCLR/models/resnet_pretrained.py | 1 - evaluate_simCLR.py | 27 ++++++++++----------------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/SimCLR/models/resnet_pretrained.py 
b/SimCLR/models/resnet_pretrained.py index ef088a0..840577a 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -10,7 +10,6 @@ def __init__(self, base_model, pretrained_model_file, linear_eval=True, num_clas super(PretrainedResNet, self).__init__() self.pretrained_model_file = pretrained_model_file - print(self.pretrained_model_file, flush=True) self.resnet_dict = { "resnet18": models.resnet18(pretrained=False, num_classes=num_classes), diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 27f2416..49f424e 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -227,7 +227,6 @@ def main(): elif args.dataset_name == "stl10": num_classes = 10 elif args.dataset_name == "imagenet": - print("Using ImageNet dataset", flush=True) num_classes = 1000 model = PretrainedResNet( @@ -235,8 +234,6 @@ def main(): pretrained_model_file = os.path.join(args.pretrained_model_dir, args.experiment_name, args.pretrained_model_name), linear_eval=args.linear_evaluation, num_classes=num_classes) - - print("loaded model", flush=True) if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): # set the single device scope, otherwise DistributedDataParallel will @@ -255,18 +252,14 @@ def main(): criterion = torch.nn.CrossEntropyLoss().cuda(device_id) - n_iter = 0 - log_dir = os.path.join(args.pretrained_model_dir, args.experiment_name) - print(f"log_dir:{log_dir}", flush=True) - for epoch_counter in tqdm(range(args.epochs), desc="Epoch Progress"): if dist_utils.is_dist_avail_and_initialized(): train_loader.sampler.set_epoch(epoch_counter) top1_train_accuracy = 0 - print(f"epoch:{epoch_counter}", flush=True) - for counter, (x_batch, y_batch) in tqdm(enumerate(train_loader), desc="Training Progress"): + counter = 0 + for counter, (x_batch, y_batch) in tqdm(train_loader, desc="Training Progress"): x_batch = x_batch.cuda(device_id) y_batch = y_batch.cuda(device_id) @@ -278,14 +271,13 @@ def main(): optimizer.zero_grad() 
loss.backward() optimizer.step() - n_iter += 1 - if counter % 100 == 0: - print(f"Epoch {epoch_counter}\t Iteration {counter}\t Loss: {loss.item()}", flush=True) + counter += 1 - top1_train_accuracy /= counter + 1 + top1_train_accuracy /= counter top1_accuracy = 0 top5_accuracy = 0 - for counter, (x_batch, y_batch) in tqdm(enumerate(test_loader), desc="Evaluation Progress"): + counter = 0 + for x_batch, y_batch in tqdm(test_loader, desc="Evaluation Progress"): x_batch = x_batch.cuda(device_id) y_batch = y_batch.cuda(device_id) @@ -294,11 +286,12 @@ def main(): top1, top5 = accuracy(logits, y_batch, topk=(1, 5)) top1_accuracy += top1[0] top5_accuracy += top5[0] + counter += 1 - top1_accuracy /= counter + 1 - top5_accuracy /= counter + 1 + top1_accuracy /= counter + top5_accuracy /= counter print( - f"Epoch {n_iter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", + f"Epoch {epoch_counter}\t Top1 Train accuracy {top1_train_accuracy.item()}\tTop1 Test accuracy: {top1_accuracy.item()}\tTop5 test acc: {top5_accuracy.item()}", flush=True, ) if args.enable_checkpointing: From ed78e17d9d2b785aa884c4e6407d1afa16fedb0d Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 12:59:47 -0800 Subject: [PATCH 52/57] update --- evaluate_simCLR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluate_simCLR.py b/evaluate_simCLR.py index 49f424e..cc1cca3 100644 --- a/evaluate_simCLR.py +++ b/evaluate_simCLR.py @@ -259,7 +259,7 @@ def main(): train_loader.sampler.set_epoch(epoch_counter) top1_train_accuracy = 0 counter = 0 - for counter, (x_batch, y_batch) in tqdm(train_loader, desc="Training Progress"): + for x_batch, y_batch in tqdm(train_loader, desc="Training Progress"): x_batch = x_batch.cuda(device_id) y_batch = y_batch.cuda(device_id) From a3926674d2d77dc114baf1803332ccbdbdca4e57 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 13:10:11 -0800 
Subject: [PATCH 53/57] try catch the file exist error --- run_simCLR.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/run_simCLR.py b/run_simCLR.py index 3d74b29..cd8cebd 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -179,8 +179,10 @@ def main(): now = datetime.now() dt_string = now.strftime("%Y_%m_%d_%H_%M") log_dir = os.path.join(args.model_dir, args.experiment_name,dt_string) - if not os.path.exists(log_dir): + try: os.makedirs(log_dir) + except FileExistsError: + print(f"Directory {log_dir} made by another worker") # Set the start method to spawn for distributed training torch.multiprocessing.set_start_method("spawn") From 6c5cf20240b64a3dd418b3e16132cc0f37273e37 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 13:10:25 -0800 Subject: [PATCH 54/57] update --- run_simCLR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_simCLR.py b/run_simCLR.py index cd8cebd..c067f63 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -182,7 +182,7 @@ def main(): try: os.makedirs(log_dir) except FileExistsError: - print(f"Directory {log_dir} made by another worker") + print(f"Directory {log_dir} made by another worker", flush=True) # Set the start method to spawn for distributed training torch.multiprocessing.set_start_method("spawn") From 313b7055d39c7bb0ff305c97ac558545bcb92fd3 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Fri, 23 Feb 2024 13:13:48 -0800 Subject: [PATCH 55/57] update logging part --- SimCLR/simclr.py | 11 ++++++++++- run_simCLR.py | 12 ------------ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index 21a8652..e03ee60 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -1,4 +1,5 @@ import os +from datetime import datetime import torch from torch.cuda.amp import GradScaler, autocast @@ -12,12 +13,20 @@ class SimCLR(object): - def __init__(self, log_dir, *args, **kwargs): + def __init__(self, *args, **kwargs): self.args = kwargs["args"] 
self.model = kwargs["model"] self.optimizer = kwargs["optimizer"] self.scheduler = kwargs["scheduler"] self.device_id = kwargs["device_id"] + # Create a directory to save the model checkpoints and logs + now = datetime.now() + dt_string = now.strftime("%Y_%m_%d_%H_%M") + log_dir = os.path.join(args.model_dir, args.experiment_name,dt_string) + try: + os.makedirs(log_dir) + except FileExistsError: + print(f"Directory {log_dir} made by another worker", flush=True) self.writer = SummaryWriter(log_dir) self.criterion = loss.SimCLRContrastiveLoss(self.args.temperature).cuda( self.device_id diff --git a/run_simCLR.py b/run_simCLR.py index c067f63..c307027 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -3,8 +3,6 @@ import random from functools import partial -import os -from datetime import datetime import torch from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 from torch.utils.data.distributed import DistributedSampler @@ -175,15 +173,6 @@ def main(): args = parser.parse_args() print(args) - # Create a directory to save the model checkpoints and logs - now = datetime.now() - dt_string = now.strftime("%Y_%m_%d_%H_%M") - log_dir = os.path.join(args.model_dir, args.experiment_name,dt_string) - try: - os.makedirs(log_dir) - except FileExistsError: - print(f"Directory {log_dir} made by another worker", flush=True) - # Set the start method to spawn for distributed training torch.multiprocessing.set_start_method("spawn") @@ -263,7 +252,6 @@ def main(): optimizer, T_max=len(train_loader), eta_min=0, last_epoch=-1 ) simclr = SimCLR( - log_dir=log_dir, model=model, optimizer=optimizer, scheduler=scheduler, From 67b5a9f0d3be56cd42645528ff67703f556948d9 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Sun, 25 Feb 2024 10:05:19 -0800 Subject: [PATCH 56/57] update slrm scripts --- eval_simclr.slrm | 14 ++++++-------- train_simclr.slrm | 8 +++----- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/eval_simclr.slrm b/eval_simclr.slrm index 
e2e1d7a..6c4b627 100644 --- a/eval_simclr.slrm +++ b/eval_simclr.slrm @@ -1,15 +1,15 @@ #!/bin/bash #SBATCH --job-name=train_sunrgbd -#SBATCH --partition=a100 +#SBATCH --partition=t4v2 #SBATCH --nodes=1 #SBATCH --gres=gpu:4 -#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 #SBATCH --mem=100G #SBATCH --output=logs/simclr/eval_slurm-%N-%j.out #SBATCH --error=logs/simclr/eval_slurm-%N-%j.err -#SBATCH --qos=a100_arashaf +#SBATCH --qos=m PY_ARGS=${@:1} @@ -39,15 +39,13 @@ for file in $files do echo "Evaluating: $file" - # torchrun execute nproc-per-node * nodes times - torchrun --nnodes 1 --nproc-per-node 4 evaluate_simCLR.py \ + # srun execute ntasks-per-node * nodes times + srun python evaluate_simCLR.py \ --distributed_mode \ - --distributed_launcher="pytorch" \ --batch-size=256 \ --pretrained_model_dir=$pretrained_model_dir \ --experiment_name=$experiment_name \ --pretrained_model_name=$file \ - --linear_evaluation \ - --arch="resnet50" + --linear_evaluation # Add your processing logic here done \ No newline at end of file diff --git a/train_simclr.slrm b/train_simclr.slrm index b5d0acb..0a352eb 100644 --- a/train_simclr.slrm +++ b/train_simclr.slrm @@ -25,12 +25,10 @@ export MASTER_PORT=45679 export PYTHONPATH="." 
nvidia-smi -# torchrun execute nproc-per-node * nodes times -torchrun --nnodes 1 --nproc-per-node 4 run_simCLR.py \ +# srun execute ntasks-per-node * nodes times +srun python run_simCLR.py \ --fp16-precision \ --distributed_mode \ ---distributed_launcher="pytorch" \ --batch-size=256 \ --model_dir="/projects/imagenet_synthetic/train_models" \ ---experiment_name="simclr" \ ---arch="resnet50" \ No newline at end of file +--experiment_name="simclr" \ No newline at end of file From 1e2ba8142c309ca905b5e579db835a9c99c5f719 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Tue, 27 Feb 2024 17:43:15 -0800 Subject: [PATCH 57/57] update resnet pretrained --- SimCLR/models/resnet_pretrained.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SimCLR/models/resnet_pretrained.py b/SimCLR/models/resnet_pretrained.py index 840577a..bc37e85 100644 --- a/SimCLR/models/resnet_pretrained.py +++ b/SimCLR/models/resnet_pretrained.py @@ -34,7 +34,7 @@ def _load_pretrained(self): state_dict = checkpoint["state_dict"] for k in list(state_dict.keys()): if k.startswith("module.backbone."): - if k.startswith("module.backbone") and not k.startswith("module.backbone.fc"): + if not k.startswith("module.backbone.fc"): # remove prefix state_dict[k[len("module.backbone.") :]] = state_dict[k] del state_dict[k]