From 0eeb86e19fc91f8295ac43230e94ac5d50811f41 Mon Sep 17 00:00:00 2001 From: Arash Afkanpour Date: Tue, 27 Feb 2024 17:43:15 -0500 Subject: [PATCH 01/38] Add simsiam. --- SimCLR/data_aug/imagenet_synthetic_dataset.py | 39 +++++++++++++----- SimCLR/simclr.py | 20 ++++++++-- run_simCLR.py | 12 ++++++ simsiam | 1 + train_simclr.slrm | 10 +++-- train_simsiam.slrm | 40 +++++++++++++++++++ 6 files changed, 106 insertions(+), 16 deletions(-) create mode 160000 simsiam create mode 100644 train_simsiam.slrm diff --git a/SimCLR/data_aug/imagenet_synthetic_dataset.py b/SimCLR/data_aug/imagenet_synthetic_dataset.py index 66c245e..7c05a6c 100644 --- a/SimCLR/data_aug/imagenet_synthetic_dataset.py +++ b/SimCLR/data_aug/imagenet_synthetic_dataset.py @@ -3,6 +3,7 @@ import os import random +import torch from PIL import Image from torchvision import datasets, transforms @@ -37,6 +38,7 @@ def __init__( imagenet_synthetic_root, index_min=0, index_max=9, + generative_augmentation_prob=None, load_one_real_image=False, split="train", ): @@ -48,6 +50,7 @@ def __init__( self.imagenet_synthetic_root = imagenet_synthetic_root self.index_min = index_min self.index_max = index_max + self.generative_augmentation_prob = generative_augmentation_prob self.load_one_real_image = load_one_real_image self.synthetic_transforms = _get_simclr_transforms(size=224) self.real_transforms = _get_simclr_transforms(size=224, random_crop=True) @@ -62,21 +65,37 @@ def _synthetic_image(filename): filename_parent_dir = filename.split("/")[-2] image_path = os.path.join( self.imagenet_synthetic_root, - # self.split, + self.split, filename_parent_dir, filename_and_extension.split(".")[0] + f"_{rand_int}.JPEG", ) return Image.open(image_path).convert("RGB") - if self.load_one_real_image: - image1 = self.loader(os.path.join(self.root, imagenet_filename)) - image1 = self.real_transforms(image1) - else: - image1 = _synthetic_image(imagenet_filename) - image1 = self.synthetic_transforms(image1) + if 
self.generative_augmentation_prob is not None: + if torch.rand(1) < self.generative_augmentation_prob: + # Generate a synthetic image. + image1 = _synthetic_image(imagenet_filename) + image1 = self.synthetic_transforms(image1) + else: + image1 = self.loader(os.path.join(self.root, imagenet_filename)) + image1 = self.real_transforms(image1) - # image2 is always synthetic. - image2 = _synthetic_image(imagenet_filename) - image2 = self.synthetic_transforms(image2) + if torch.rand(1) < self.generative_augmentation_prob: + # Generate another synthetic image. + image2 = _synthetic_image(imagenet_filename) + image2 = self.synthetic_transforms(image2) + else: + image2 = self.loader(os.path.join(self.root, imagenet_filename)) + image2 = self.real_transforms(image2) + else: + if self.load_one_real_image: + image1 = self.loader(os.path.join(self.root, imagenet_filename)) + image1 = self.real_transforms(image1) + else: + image1 = _synthetic_image(imagenet_filename) + image1 = self.synthetic_transforms(image1) + # image2 is always synthetic. + image2 = _synthetic_image(imagenet_filename) + image2 = self.synthetic_transforms(image2) return {"view1": image1, "view2": image2}, label diff --git a/SimCLR/simclr.py b/SimCLR/simclr.py index 8178066..39dbc31 100644 --- a/SimCLR/simclr.py +++ b/SimCLR/simclr.py @@ -23,6 +23,17 @@ def __init__(self, *args, **kwargs): self.device_id, ) self.checkpoint_dir = self.args.checkpoint_dir + self.start_epoch = 0 + + if self.args.last_checkpoint: + checkpoint = torch.load(self.args.last_checkpoint) + self.model.load_state_dict(checkpoint["state_dict"]) + self.optimizer.load_state_dict(checkpoint["optimizer"]) + # Start from the next epoch. + self.start_epoch = checkpoint["epoch"] + 1 + print( + f"Checkpoint loaded. 
Resuming training from epoch: {self.start_epoch}" + ) def train(self, train_loader): scaler = GradScaler(enabled=self.args.fp16_precision) @@ -32,9 +43,12 @@ def train(self, train_loader): print(f"Log dir: {self.writer.log_dir}") n_iter = 0 - print(f"Start SimCLR training for {self.args.epochs} epochs.") + print( + f"Start SimCLR training for {self.args.epochs} epochs starting from {self.start_epoch}." + ) - for epoch_counter in tqdm(range(self.args.epochs), desc="Training Progress"): + train_range = range(self.start_epoch, self.args.epochs) + for epoch_counter in tqdm(train_range, desc="Training Progress"): if dist_utils.is_dist_avail_and_initialized(): train_loader.sampler.set_epoch(epoch_counter) for images, _ in tqdm(train_loader): @@ -77,7 +91,7 @@ def train(self, train_loader): checkpoint_file = os.path.join(self.checkpoint_dir, checkpoint_name) save_checkpoint( { - "epoch": self.args.epochs, + "epoch": epoch_counter, "arch": self.args.arch, "state_dict": self.model.state_dict(), "optimizer": self.optimizer.state_dict(), diff --git a/run_simCLR.py b/run_simCLR.py index b7ee380..072b73b 100644 --- a/run_simCLR.py +++ b/run_simCLR.py @@ -177,6 +177,17 @@ type=int, help="Synthetic data files are named filename_i.JPEG. This index determines the upper bound for i.", ) +parser.add_argument( + "--last_checkpoint", + default="", + help="Last model checkpoint file to resume training from.", +) +parser.add_argument( + "--generative_augmentation_prob", + default=None, + type=float, + help="The probability of applying a generative model augmentation to a view. 
Applies to the views separately.", +) def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: @@ -231,6 +242,7 @@ def main(): args.synthetic_data_dir, index_min=args.synthetic_index_min, index_max=args.synthetic_index_max, + generative_augmentation_prob=args.generative_augmentation_prob, ) else: print(f"Using real data for training at {args.data}.") diff --git a/simsiam b/simsiam new file mode 160000 index 0000000..a7bc177 --- /dev/null +++ b/simsiam @@ -0,0 +1 @@ +Subproject commit a7bc1772896d0dad0806c51f0bb6f3b16d290468 diff --git a/train_simclr.slrm b/train_simclr.slrm index 6a03dca..76800d6 100644 --- a/train_simclr.slrm +++ b/train_simclr.slrm @@ -1,6 +1,6 @@ #!/bin/bash -#SBATCH --job-name=simclr_base +#SBATCH --job-name=simclr_icgan #SBATCH --partition=a100 #SBATCH --qos=a100_arashaf #SBATCH --time=72:00:00 @@ -9,7 +9,7 @@ #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu=4G -#SBATCH --output=slurm-%N-%j.out +#SBATCH --output=slurm-%j.out PY_ARGS=${@:1} @@ -31,4 +31,8 @@ srun python run_simCLR.py \ --distributed_mode \ --batch-size=512 \ --epochs=100 \ ---no-use_synthetic_data \ No newline at end of file +--use_synthetic_data \ +--synthetic_data_dir="/projects/imagenet_synthetic/synthetic_icgan" \ +--synthetic_index_min=0 \ +--synthetic_index_max=4 \ +--generative_augmentation_prob=0.5 diff --git a/train_simsiam.slrm b/train_simsiam.slrm new file mode 100644 index 0000000..4d5db7f --- /dev/null +++ b/train_simsiam.slrm @@ -0,0 +1,40 @@ +#!/bin/bash + +#SBATCH --job-name="simsiam_icgan" +#SBATCH --partition=a100 +#SBATCH --qos=a100_arashaf +#SBATCH --time=72:00:00 +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=4G +#SBATCH --output=slurm-%j.out + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export 
CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export PYTHONPATH="." +nvidia-smi + +# “srun” executes the script (nodes * ntasks-per-node) times +srun python simsiam/main_simsiam.py \ +-a resnet50 \ +--fix-pred-lr \ +--distributed_mode \ +--batch-size=256 \ +--epochs=100 \ +--use_synthetic_data \ +--synthetic_data_dir="/projects/imagenet_synthetic/synthetic_icgan" \ +--synthetic_index_min=0 \ +--synthetic_index_max=4 \ +--generative_augmentation_prob=0.5 +--experiment="simsiam_icgan" \ From 260c98b7c87197594b4a621f99ee7317b75f29ab Mon Sep 17 00:00:00 2001 From: Arash Afkanpour Date: Tue, 27 Feb 2024 17:49:42 -0500 Subject: [PATCH 02/38] Adding simsiam files. --- simsiam | 1 - simsiam/LICENSE | 399 +++++++++++++++++++++++++++++++ simsiam/README.md | 96 ++++++++ simsiam/__init__.py | 0 simsiam/builder.py | 61 +++++ simsiam/loader.py | 126 ++++++++++ simsiam/main_lincls.py | 516 ++++++++++++++++++++++++++++++++++++++++ simsiam/main_simsiam.py | 357 +++++++++++++++++++++++++++ 8 files changed, 1555 insertions(+), 1 deletion(-) delete mode 160000 simsiam create mode 100644 simsiam/LICENSE create mode 100644 simsiam/README.md create mode 100644 simsiam/__init__.py create mode 100644 simsiam/builder.py create mode 100644 simsiam/loader.py create mode 100755 simsiam/main_lincls.py create mode 100755 simsiam/main_simsiam.py diff --git a/simsiam b/simsiam deleted file mode 160000 index a7bc177..0000000 --- a/simsiam +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a7bc1772896d0dad0806c51f0bb6f3b16d290468 diff --git a/simsiam/LICENSE b/simsiam/LICENSE new file mode 100644 index 0000000..105a4fb --- /dev/null +++ b/simsiam/LICENSE @@ -0,0 +1,399 @@ +Attribution-NonCommercial 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. 
Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. 
If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-NonCommercial 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-NonCommercial 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. 
For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + e. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + i. NonCommercial means not primarily intended for or directed towards + commercial advantage or monetary compensation. 
For purposes of + this Public License, the exchange of the Licensed Material for + other material subject to Copyright and Similar Rights by digital + file-sharing or similar means is NonCommercial provided there is + no payment of monetary compensation in connection with the + exchange. + + j. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + k. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + l. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part, for NonCommercial purposes only; and + + b. produce, reproduce, and Share Adapted Material for + NonCommercial purposes only. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. 
Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. 
Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties, including when + the Licensed Material is used other than for NonCommercial + purposes. + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. 
If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database for NonCommercial purposes + only; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. 
WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + +Section 7 -- Other Terms and Conditions. 
+ + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. 
Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. \ No newline at end of file diff --git a/simsiam/README.md b/simsiam/README.md new file mode 100644 index 0000000..47bab1b --- /dev/null +++ b/simsiam/README.md @@ -0,0 +1,96 @@ +# SimSiam: Exploring Simple Siamese Representation Learning + +

+ simsiam +

+ +This is a PyTorch implementation of the [SimSiam paper](https://arxiv.org/abs/2011.10566): +``` +@Article{chen2020simsiam, + author = {Xinlei Chen and Kaiming He}, + title = {Exploring Simple Siamese Representation Learning}, + journal = {arXiv preprint arXiv:2011.10566}, + year = {2020}, +} +``` + +### Preparation + +Install PyTorch and download the ImageNet dataset following the [official PyTorch ImageNet training code](https://github.com/pytorch/examples/tree/master/imagenet). Similar to [MoCo](https://github.com/facebookresearch/moco), the code release contains minimal modifications for both unsupervised pre-training and linear classification to that code. + +In addition, install [apex](https://github.com/NVIDIA/apex) for the [LARS](https://github.com/NVIDIA/apex/blob/master/apex/parallel/LARC.py) implementation needed for linear classification. + +### Unsupervised Pre-Training + +Only **multi-gpu**, **DistributedDataParallel** training is supported; single-gpu or DataParallel training is not supported. + +To do unsupervised pre-training of a ResNet-50 model on ImageNet in an 8-gpu machine, run: +``` +python main_simsiam.py \ + -a resnet50 \ + --dist-url 'tcp://localhost:10001' --multiprocessing-distributed --world-size 1 --rank 0 \ + --fix-pred-lr \ + [your imagenet-folder with train and val folders] +``` +The script uses all the default hyper-parameters as described in the paper, and uses the default augmentation recipe from [MoCo v2](https://arxiv.org/abs/2003.04297). + +The above command performs pre-training with a non-decaying predictor learning rate for 100 epochs, corresponding to the last row of Table 1 in the paper. 
+ +### Linear Classification + +With a pre-trained model, to train a supervised linear classifier on frozen features/weights in an 8-gpu machine, run: +``` +python main_lincls.py \ + -a resnet50 \ + --dist-url 'tcp://localhost:10001' --multiprocessing-distributed --world-size 1 --rank 0 \ + --pretrained [your checkpoint path]/checkpoint_0099.pth.tar \ + --lars \ + [your imagenet-folder with train and val folders] +``` + +The above command uses LARS optimizer and a default batch size of 4096. + +### Models and Logs + +Our pre-trained ResNet-50 models and logs: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
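| pre-train epochs | batch size | pre-train ckpt | pre-train log | linear cls. ckpt | linear cls. log | top-1 acc. |
| --- | --- | --- | --- | --- | --- | --- |
| 100 | 512 | link | link | link | link | 68.1 |
| 100 | 256 | link | link | link | link | 68.3 |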
+ +Settings for the above: 8 NVIDIA V100 GPUs, CUDA 10.1/CuDNN 7.6.5, PyTorch 1.7.0. + +### Transferring to Object Detection + +Same as [MoCo](https://github.com/facebookresearch/moco) for object detection transfer, please see [moco/detection](https://github.com/facebookresearch/moco/tree/master/detection). + + +### License + +This project is under the CC-BY-NC 4.0 license. See [LICENSE](LICENSE) for details. \ No newline at end of file diff --git a/simsiam/__init__.py b/simsiam/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/simsiam/builder.py b/simsiam/builder.py new file mode 100644 index 0000000..7ca8c50 --- /dev/null +++ b/simsiam/builder.py @@ -0,0 +1,61 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + + +class SimSiam(nn.Module): + """ + Build a SimSiam model. + """ + def __init__(self, base_encoder, dim=2048, pred_dim=512): + """ + dim: feature dimension (default: 2048) + pred_dim: hidden dimension of the predictor (default: 512) + """ + super(SimSiam, self).__init__() + + # create the encoder + # num_classes is the output fc dimension, zero-initialize last BNs + self.encoder = base_encoder(num_classes=dim, zero_init_residual=True) + + # build a 3-layer projector + prev_dim = self.encoder.fc.weight.shape[1] + self.encoder.fc = nn.Sequential(nn.Linear(prev_dim, prev_dim, bias=False), + nn.BatchNorm1d(prev_dim), + nn.ReLU(inplace=True), # first layer + nn.Linear(prev_dim, prev_dim, bias=False), + nn.BatchNorm1d(prev_dim), + nn.ReLU(inplace=True), # second layer + self.encoder.fc, + nn.BatchNorm1d(dim, affine=False)) # output layer + self.encoder.fc[6].bias.requires_grad = False # hack: not use bias as it is followed by BN + + # build a 2-layer predictor + self.predictor = nn.Sequential(nn.Linear(dim, pred_dim, bias=False), + 
nn.BatchNorm1d(pred_dim), + nn.ReLU(inplace=True), # hidden layer + nn.Linear(pred_dim, dim)) # output layer + + def forward(self, x1, x2): + """ + Input: + x1: first views of images + x2: second views of images + Output: + p1, p2, z1, z2: predictors and targets of the network + See Sec. 3 of https://arxiv.org/abs/2011.10566 for detailed notations + """ + + # compute features for one view + z1 = self.encoder(x1) # NxC + z2 = self.encoder(x2) # NxC + + p1 = self.predictor(z1) # NxC + p2 = self.predictor(z2) # NxC + + return p1, p2, z1.detach(), z2.detach() diff --git a/simsiam/loader.py b/simsiam/loader.py new file mode 100644 index 0000000..69a33ac --- /dev/null +++ b/simsiam/loader.py @@ -0,0 +1,126 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import os +import random + +import torch +from PIL import Image, ImageFilter +from torchvision import datasets, transforms + + +class GaussianBlur(object): + """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709.""" + + def __init__(self, sigma=[0.1, 2.0]): + self.sigma = sigma + + def __call__(self, x): + sigma = random.uniform(self.sigma[0], self.sigma[1]) + x = x.filter(ImageFilter.GaussianBlur(radius=sigma)) + return x + + +_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + +# MoCo v2's aug: similar to SimCLR https://arxiv.org/abs/2002.05709 +_real_augmentations = [ + transforms.RandomResizedCrop(224, scale=(0.2, 1.0)), + transforms.RandomApply( + [ + transforms.ColorJitter(0.4, 0.4, 0.4, 0.1) # not strengthened + ], + p=0.8, + ), + transforms.RandomGrayscale(p=0.2), + transforms.RandomApply([GaussianBlur([0.1, 2.0])], p=0.5), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + _normalize, +] + + +class TwoCropsTransform: + """Take two random crops of one image as the query and key.""" + + def 
__init__(self): + self.base_transform = transforms.Compose(_real_augmentations) + + def __call__(self, x): + q = self.base_transform(x) + k = self.base_transform(x) + return [q, k] + + +class ImageNetSynthetic(datasets.ImageNet): + def __init__( + self, + imagenet_root, + imagenet_synthetic_root, + index_min=0, + index_max=9, + generative_augmentation_prob=None, + load_one_real_image=False, + split="train", + ): + super(ImageNetSynthetic, self).__init__( + root=imagenet_root, + split=split, + ) + self.imagenet_root = imagenet_root + self.imagenet_synthetic_root = imagenet_synthetic_root + self.index_min = index_min + self.index_max = index_max + self.generative_augmentation_prob = generative_augmentation_prob + self.load_one_real_image = load_one_real_image + self.real_transforms = transforms.Compose(_real_augmentations) + # Remove random crop for synthetic image augmentation. + self.synthetic_transforms = transforms.Compose(_real_augmentations[1:]) + self.split = split + + def __getitem__(self, index): + imagenet_filename, label = self.imgs[index] + + def _synthetic_image(filename): + rand_int = random.randint(self.index_min, self.index_max) + filename_and_extension = filename.split("/")[-1] + filename_parent_dir = filename.split("/")[-2] + image_path = os.path.join( + self.imagenet_synthetic_root, + self.split, + filename_parent_dir, + filename_and_extension.split(".")[0] + f"_{rand_int}.JPEG", + ) + return Image.open(image_path).convert("RGB") + + if self.generative_augmentation_prob is not None: + if torch.rand(1) < self.generative_augmentation_prob: + # Generate a synthetic image. + image1 = _synthetic_image(imagenet_filename) + image1 = self.synthetic_transforms(image1) + else: + image1 = self.loader(os.path.join(self.root, imagenet_filename)) + image1 = self.real_transforms(image1) + + if torch.rand(1) < self.generative_augmentation_prob: + # Generate another synthetic image. 
+ image2 = _synthetic_image(imagenet_filename) + image2 = self.synthetic_transforms(image2) + else: + image2 = self.loader(os.path.join(self.root, imagenet_filename)) + image2 = self.real_transforms(image2) + else: + if self.load_one_real_image: + image1 = self.loader(os.path.join(self.root, imagenet_filename)) + image1 = self.real_transforms(image1) + else: + image1 = _synthetic_image(imagenet_filename) + image1 = self.synthetic_transforms(image1) + # image2 is always synthetic. + image2 = _synthetic_image(imagenet_filename) + image2 = self.synthetic_transforms(image2) + + return [image1, image2], label diff --git a/simsiam/main_lincls.py b/simsiam/main_lincls.py new file mode 100755 index 0000000..ed4e208 --- /dev/null +++ b/simsiam/main_lincls.py @@ -0,0 +1,516 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import builtins +import math +import os +import random +import shutil +import time +import warnings + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +import torch.multiprocessing as mp +import torch.utils.data +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torchvision.models as models + +model_names = sorted(name for name in models.__dict__ + if name.islower() and not name.startswith("__") + and callable(models.__dict__[name])) + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet50)') 
+parser.add_argument('-j', '--workers', default=32, type=int, metavar='N', + help='number of data loading workers (default: 32)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=4096, type=int, + metavar='N', + help='mini-batch size (default: 4096), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial (base) learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=0., type=float, + metavar='W', help='weight decay (default: 0.)', + dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', + help='evaluate model on validation set') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. 
') +parser.add_argument('--gpu', default=None, type=int, + help='GPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') + +# additional configs: +parser.add_argument('--pretrained', default='', type=str, + help='path to simsiam pretrained checkpoint') +parser.add_argument('--lars', action='store_true', + help='Use LARS') + +best_acc1 = 0 + + +def main(): + args = parser.parse_args() + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. 
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + ngpus_per_node = torch.cuda.device_count() + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + + +def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + args.gpu = gpu + + # suppress printing if not master + if args.multiprocessing_distributed and args.gpu != 0: + def print_pass(*args): + pass + builtins.print = print_pass + + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + torch.distributed.barrier() + # create model + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + + # freeze all layers but the last fc + for name, param in model.named_parameters(): + if name not in ['fc.weight', 'fc.bias']: + param.requires_grad = False + # init the fc layer + model.fc.weight.data.normal_(mean=0.0, std=0.01) + model.fc.bias.data.zero_() + + # load from pre-trained, before DistributedDataParallel constructor + 
if args.pretrained: + if os.path.isfile(args.pretrained): + print("=> loading checkpoint '{}'".format(args.pretrained)) + checkpoint = torch.load(args.pretrained, map_location="cpu") + + # rename moco pre-trained keys + state_dict = checkpoint['state_dict'] + for k in list(state_dict.keys()): + # retain only encoder up to before the embedding layer + if k.startswith('module.encoder') and not k.startswith('module.encoder.fc'): + # remove prefix + state_dict[k[len("module.encoder."):]] = state_dict[k] + # delete renamed or unused k + del state_dict[k] + + args.start_epoch = 0 + msg = model.load_state_dict(state_dict, strict=False) + assert set(msg.missing_keys) == {"fc.weight", "fc.bias"} + + print("=> loaded pre-trained model '{}'".format(args.pretrained)) + else: + print("=> no checkpoint found at '{}'".format(args.pretrained)) + + # infer learning rate before changing batch size + init_lr = args.lr * args.batch_size / 256 + + if args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. 
+ if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs we have + args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.gpu is not None: + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = torch.nn.DataParallel(model).cuda() + + # define loss function (criterion) and optimizer + criterion = nn.CrossEntropyLoss().cuda(args.gpu) + + # optimize only the linear classifier + parameters = list(filter(lambda p: p.requires_grad, model.parameters())) + assert len(parameters) == 2 # fc.weight, fc.bias + + optimizer = torch.optim.SGD(parameters, init_lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + if args.lars: + print("=> use LARS optimizer.") + from apex.parallel.LARC import LARC + optimizer = LARC(optimizer=optimizer, trust_coefficient=.001, clip=False) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.gpu is None: + checkpoint = torch.load(args.resume) + else: + # Map model to be loaded to specified single gpu. 
+ loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + # Data loading code + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + else: + train_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(valdir, transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])), + batch_size=256, shuffle=False, + num_workers=args.workers, pin_memory=True) + + if args.evaluate: + validate(val_loader, model, criterion, args) + return + + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + adjust_learning_rate(optimizer, init_lr, epoch, args) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, args) + + # evaluate on validation set + acc1 = validate(val_loader, model, 
criterion, args) + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer' : optimizer.state_dict(), + }, is_best) + if epoch == args.start_epoch: + sanity_check(model.state_dict(), args.pretrained) + + +def train(train_loader, model, criterion, optimizer, epoch, args): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch)) + + """ + Switch to eval mode: + Under the protocol of linear classification on frozen features/models, + it is not legitimate to change any part of the pre-trained model. + BatchNorm in train mode may revise running mean/std (even if it receives + no gradient), which are part of the model parameters too. 
+ """ + model.eval() + + end = time.time() + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + +def validate(val_loader, model, criterion, args): + batch_time = AverageMeter('Time', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(val_loader), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(val_loader): + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + # TODO: this should also be done with the ProgressMeter + print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' + 
.format(top1=top1, top5=top5)) + + return top1.avg + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + + +def sanity_check(state_dict, pretrained_weights): + """ + Linear classifier should not change any weights other than the linear layer. + This sanity check asserts nothing wrong happens (e.g., BN stats updated). + """ + print("=> loading '{}' for sanity check".format(pretrained_weights)) + checkpoint = torch.load(pretrained_weights, map_location="cpu") + state_dict_pre = checkpoint['state_dict'] + + for k in list(state_dict.keys()): + # only ignore fc layer + if 'fc.weight' in k or 'fc.bias' in k: + continue + + # name in pretrained model + k_pre = 'module.encoder.' + k[len('module.'):] \ + if k.startswith('module.') else 'module.encoder.' + k + + assert ((state_dict[k].cpu() == state_dict_pre[k_pre]).all()), \ + '{} is changed in linear classifier training.'.format(k) + + print("=> sanity check passed.") + + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f'): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 
1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + + +def adjust_learning_rate(optimizer, init_lr, epoch, args): + """Decay the learning rate based on schedule""" + cur_lr = init_lr * 0.5 * (1. + math.cos(math.pi * epoch / args.epochs)) + for param_group in optimizer.param_groups: + param_group['lr'] = cur_lr + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == '__main__': + main() diff --git a/simsiam/main_simsiam.py b/simsiam/main_simsiam.py new file mode 100755 index 0000000..4055b7e --- /dev/null +++ b/simsiam/main_simsiam.py @@ -0,0 +1,357 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import argparse +import math +import os +import random +from datetime import datetime +from functools import partial + +import torch +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +from torch import nn +from torch.backends import cudnn +from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 +from torch.utils.data.distributed import DistributedSampler +from torchvision import datasets, models +from tqdm import tqdm + +from SimCLR import distributed as dist_utils +from simsiam import builder, loader + + +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + +parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") +parser.add_argument( + "--data_dir", + metavar="DIR", + default="/scratch/ssd004/datasets/imagenet256", + help="path to dataset.", +) +parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet50", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) +parser.add_argument( + "-j", + "--num_workers", + default=4, + type=int, + metavar="N", + help="number of data loading workers (default: 32)", +) +parser.add_argument( + "--epochs", default=100, type=int, metavar="N", help="number of total epochs to run" +) +parser.add_argument( + "-b", + "--batch-size", + default=256, + type=int, + metavar="N", + help="mini-batch size (default: 512), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) +parser.add_argument( + "--lr", + "--learning-rate", + default=0.05, + type=float, + metavar="LR", + help="initial (base) learning rate", + dest="lr", +) +parser.add_argument( + "--momentum", default=0.9, type=float, metavar="M", help="momentum of SGD solver" +) +parser.add_argument( + "--wd", + "--weight-decay", + default=1e-4, + 
type=float, + metavar="W", + help="weight decay (default: 1e-4)", + dest="weight_decay", +) +parser.add_argument( + "--resume_from_checkpoint", + default="", + type=str, + help="Path to latest checkpoint.", +) +parser.add_argument( + "--seed", default=42, type=int, help="seed for initializing training. " +) + +# simsiam specific configs: +parser.add_argument( + "--dim", default=2048, type=int, help="feature dimension (default: 2048)" +) +parser.add_argument( + "--pred-dim", + default=512, + type=int, + help="hidden dimension of the predictor (default: 512)", +) +parser.add_argument( + "--fix-pred-lr", action="store_true", help="Fix learning rate for the predictor" +) + +parser.add_argument( + "--distributed_mode", + action="store_true", + help="Enable distributed training", +) +parser.add_argument("--distributed_launcher", default="slurm") +parser.add_argument("--distributed_backend", default="nccl") +parser.add_argument( + "--checkpoint_dir", + default="/projects/imagenet_synthetic/model_checkpoints", + help="Checkpoint root directory.", +) +parser.add_argument( + "--experiment", + default="", + help="Experiment name.", +) +parser.add_argument( + "--use_synthetic_data", + action=argparse.BooleanOptionalAction, + help="Whether to use real data or synthetic data for training.", +) +parser.add_argument( + "--synthetic_data_dir", + default="/projects/imagenet_synthetic/synthetic_icgan", + help="Path to the root of synthetic data.", +) +parser.add_argument( + "--synthetic_index_min", + default=0, + type=int, + help="Synthetic data files are named filename_i.JPEG. This index determines the lower bound for i.", +) +parser.add_argument( + "--synthetic_index_max", + default=9, + type=int, + help="Synthetic data files are named filename_i.JPEG. This index determines the upper bound for i.", +) +parser.add_argument( + "--generative_augmentation_prob", + default=None, + type=float, + help="The probability of applying a generative model augmentation to a view. 
Applies to the views separately.", +) + + +def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: + """Initialize worker processes with a random seed. + + Parameters + ---------- + worker_id : int + ID of the worker process. + num_workers : int + Total number of workers that will be initialized. + rank : int + The rank of the current process. + seed : int + A random seed used determine the worker seed. + """ + worker_seed = num_workers * rank + worker_id + seed + torch.manual_seed(worker_seed) + random.seed(worker_seed) + + +def main(): + args = parser.parse_args() + current_time = datetime.now().strftime("%Y-%m-%d-%H-%M") + checkpoint_subdir = ( + f"{args.experiment}_{current_time}" if args.experiment else f"{current_time}" + ) + args.checkpoint_dir = os.path.join(args.checkpoint_dir, checkpoint_subdir) + os.makedirs(args.checkpoint_dir, exist_ok=True) + + print(args) + + torch.multiprocessing.set_start_method("spawn") + if args.distributed_mode: + dist_utils.init_distributed_mode( + launcher=args.distributed_launcher, + backend=args.distributed_backend, + ) + device_id = torch.cuda.current_device() + else: + device_id = None + + # Data loading. + if args.use_synthetic_data: + print( + f"Using synthetic data for training at {args.synthetic_data_dir} between indices {args.synthetic_index_min} and {args.synthetic_index_max}." 
+ ) + train_dataset = loader.ImageNetSynthetic( + args.data_dir, + args.synthetic_data_dir, + index_min=args.synthetic_index_min, + index_max=args.synthetic_index_max, + generative_augmentation_prob=args.generative_augmentation_prob, + ) + else: + print(f"Using real data for training at {args.data_dir}.") + train_data_dir = os.path.join(args.data_dir, "train") + train_dataset = datasets.ImageFolder(train_data_dir, loader.TwoCropsTransform()) + + train_sampler = None + if dist_utils.is_dist_avail_and_initialized() and args.distributed_mode: + train_sampler = DistributedSampler( + train_dataset, + seed=args.seed, + drop_last=True, + ) + init_fn = partial( + worker_init_fn, + num_workers=args.num_workers, + rank=dist_utils.get_rank(), + seed=args.seed, + ) + + train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.batch_size, + shuffle=(train_sampler is None), + sampler=train_sampler, + num_workers=args.num_workers, + worker_init_fn=init_fn, + pin_memory=False, + drop_last=True, + ) + + print(f"Creating model {args.arch}") + model = builder.SimSiam(models.__dict__[args.arch], args.dim, args.pred_dim) + + if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): + # Apply SyncBN + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + # set the single device scope, otherwise DistributedDataParallel will + # use all available devices + torch.cuda.set_device(device_id) + model = model.cuda(device_id) + model = DDP(model, device_ids=[device_id]) + else: + raise NotImplementedError("Only DistributedDataParallel is supported.") + print(model) # print model after SyncBatchNorm + + # define loss function (criterion) and optimizer + criterion = nn.CosineSimilarity(dim=1).cuda(device_id) + + if args.fix_pred_lr: + optim_params = [ + {"params": model.module.encoder.parameters(), "fix_lr": False}, + {"params": model.module.predictor.parameters(), "fix_lr": True}, + ] + else: + optim_params = model.parameters() + + # infer learning 
rate before changing batch size + init_lr = args.lr * args.batch_size / 256.0 + optimizer = torch.optim.SGD( + optim_params, + init_lr, + momentum=args.momentum, + weight_decay=args.weight_decay, + ) + + start_epoch = 0 + # Optionally resume from a checkpoint + if args.resume_from_checkpoint: + if os.path.isfile(args.resume_from_checkpoint): + print(f"Loading checkpoint: {args.resume_from_checkpoint}") + checkpoint = torch.load(args.resume_from_checkpoint) + start_epoch = checkpoint["epoch"] + 1 + model.load_state_dict(checkpoint["state_dict"]) + optimizer.load_state_dict(checkpoint["optimizer"]) + print(f"Loaded checkpoint {args.resume_from_checkpoint} successfully.") + else: + raise ValueError(f"No checkpoint found at: {args.resume_from_checkpoint}") + + cudnn.benchmark = True + + for epoch in range(start_epoch, args.epochs): + print(f"Starting training epoch: {epoch}") + if dist_utils.is_dist_avail_and_initialized(): + train_sampler.set_epoch(epoch) + adjust_learning_rate(optimizer, init_lr, epoch, args) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, device_id, args) + + # Checkpointing. 
+ if dist_utils.get_rank() == 0: + checkpoint_name = f"checkpoint_{epoch}.pth.tar" + checkpoint_file = os.path.join(args.checkpoint_dir, checkpoint_name) + save_checkpoint( + { + "epoch": epoch, + "arch": args.arch, + "state_dict": model.state_dict(), + "optimizer": optimizer.state_dict(), + }, + filename=checkpoint_file, + ) + + +def train(train_loader, model, criterion, optimizer, epoch, device_id, args): + """Single epoch training code.""" + # switch to train mode + model.train() + + # for i, (images, _) in enumerate(train_loader): + for images, _ in tqdm(train_loader): + images[0] = images[0].cuda(device_id, non_blocking=True) + images[1] = images[1].cuda(device_id, non_blocking=True) + + # compute output and loss + p1, p2, z1, z2 = model(x1=images[0], x2=images[1]) + loss = -(criterion(p1, z2).mean() + criterion(p2, z1).mean()) * 0.5 + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +def save_checkpoint(state, filename="checkpoint.pth.tar"): + """Save state dictionary into a model checkpoint.""" + print(f"Saving checkpoint at: {filename}") + torch.save(state, filename) + + +def adjust_learning_rate(optimizer, init_lr, epoch, args): + """Decay the learning rate based on schedule.""" + cur_lr = init_lr * 0.5 * (1.0 + math.cos(math.pi * epoch / args.epochs)) + for param_group in optimizer.param_groups: + if "fix_lr" in param_group and param_group["fix_lr"]: + param_group["lr"] = init_lr + else: + param_group["lr"] = cur_lr + + +if __name__ == "__main__": + main() From 0f1943710c38c74d9bbbde0023d9dac2e2fc43b7 Mon Sep 17 00:00:00 2001 From: Arash Afkanpour Date: Wed, 28 Feb 2024 17:02:04 -0500 Subject: [PATCH 03/38] Add SimSiam eval script. 
--- eval_simsiam.slrm | 34 +++ simsiam/linear_eval.py | 524 ++++++++++++++++++++++++++++++++++++++++ simsiam/main_lincls.py | 516 --------------------------------------- simsiam/main_simsiam.py | 2 +- train_simsiam.slrm | 19 +- 5 files changed, 571 insertions(+), 524 deletions(-) create mode 100644 eval_simsiam.slrm create mode 100755 simsiam/linear_eval.py delete mode 100755 simsiam/main_lincls.py diff --git a/eval_simsiam.slrm b/eval_simsiam.slrm new file mode 100644 index 0000000..5479793 --- /dev/null +++ b/eval_simsiam.slrm @@ -0,0 +1,34 @@ +#!/bin/bash + +#SBATCH --job-name="simsiam_eval_baseline" +#SBATCH --partition=t4v2 +#SBATCH --time=08:00:00 +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=4G +#SBATCH --output=slurm-%j.out + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export PYTHONPATH="." +nvidia-smi + +# “srun” executes the script times +srun python simsiam/linear_eval.py \ +--data_dir="/scratch/ssd004/datasets/imagenet256" \ +--arch="resnet50" \ +--distributed_mode \ +--batch-size=256 \ +--epochs=100 \ +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_2024-02-27-16-48/checkpoint_22.pth.tar" diff --git a/simsiam/linear_eval.py b/simsiam/linear_eval.py new file mode 100755 index 0000000..c4cf05a --- /dev/null +++ b/simsiam/linear_eval.py @@ -0,0 +1,524 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import argparse +import math +import os +import random +import shutil +import time +from functools import partial + +import torch +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +from torch import nn +from torch.backends import cudnn +from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 +from torchvision import datasets, models, transforms + +from SimCLR import distributed as dist_utils + + +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + +parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") +parser.add_argument( + "--data_dir", + metavar="DIR", + default="/scratch/ssd004/datasets/imagenet256", + help="path to dataset.", +) +parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet50", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) +parser.add_argument( + "-j", + "--num_workers", + default=4, + type=int, + metavar="N", + help="number of data loading workers (default: 32)", +) +parser.add_argument( + "--epochs", default=90, type=int, metavar="N", help="number of total epochs to run" +) +parser.add_argument( + "-b", + "--batch-size", + default=4096, + type=int, + metavar="N", + help="mini-batch size (default: 4096), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) +parser.add_argument( + "--lr", + "--learning-rate", + default=0.1, + type=float, + metavar="LR", + help="initial (base) learning rate", + dest="lr", +) +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--wd", + "--weight-decay", + default=0.0, + type=float, + metavar="W", + help="weight decay (default: 0.)", + dest="weight_decay", +) +parser.add_argument( + "-p", + "--print-freq", 
+ default=10, + type=int, + metavar="N", + help="print frequency (default: 10)", +) +parser.add_argument( + "-e", + "--evaluate", + dest="evaluate", + action="store_true", + help="Whether to only evaluate model on validation set (no downstream training).", +) +parser.add_argument( + "--distributed_mode", + action="store_true", + help="Enable distributed training", +) +parser.add_argument("--distributed_launcher", default="slurm") +parser.add_argument("--distributed_backend", default="nccl") +parser.add_argument( + "--seed", default=42, type=int, help="seed for initializing training. " +) +parser.add_argument( + "--pretrained_checkpoint", + default="", + type=str, + help="Path to simsiam pretrained checkpoint.", +) +parser.add_argument("--lars", action="store_true", help="Use LARS") +parser.add_argument( + "--checkpoint_dir", + default="", + help="Checkpoint directory to save eval model checkpoints.", +) + + +best_acc1 = 0 + + +def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: + """Initialize worker processes with a random seed. + + Parameters + ---------- + worker_id : int + ID of the worker process. + num_workers : int + Total number of workers that will be initialized. + rank : int + The rank of the current process. + seed : int + A random seed used to determine the worker seed.
+ """ + worker_seed = num_workers * rank + worker_id + seed + torch.manual_seed(worker_seed) + random.seed(worker_seed) + + +def main(): + args = parser.parse_args() + global best_acc1 + + torch.multiprocessing.set_start_method("spawn") + if args.distributed_mode: + dist_utils.init_distributed_mode( + launcher=args.distributed_launcher, + backend=args.distributed_backend, + ) + device_id = torch.cuda.current_device() + else: + device_id = None + + # create model + print(f"Creating model {args.arch}") + model = models.__dict__[args.arch]() + + # freeze all layers but the last fc + for name, param in model.named_parameters(): + if name not in ["fc.weight", "fc.bias"]: + param.requires_grad = False + # init the fc layer + model.fc.weight.data.normal_(mean=0.0, std=0.01) + model.fc.bias.data.zero_() + + # load from pre-trained, before DistributedDataParallel constructor + if args.pretrained_checkpoint: + if os.path.isfile(args.pretrained_checkpoint): + print(f"Loading checkpoint {args.pretrained_checkpoint}") + checkpoint = torch.load(args.pretrained_checkpoint, map_location="cpu") + + # rename moco pre-trained keys + state_dict = checkpoint["state_dict"] + for k in list(state_dict.keys()): + # retain only encoder up to before the embedding layer + if k.startswith("module.encoder") and not k.startswith( + "module.encoder.fc" + ): + # remove prefix + state_dict[k[len("module.encoder.") :]] = state_dict[k] + # delete renamed or unused k + del state_dict[k] + + msg = model.load_state_dict(state_dict, strict=False) + assert set(msg.missing_keys) == {"fc.weight", "fc.bias"} + else: + raise ValueError(f"No checkpoint found at: {args.pretrained_checkpoint}") + + # infer learning rate before changing batch size + init_lr = args.lr * args.batch_size / 256 + + if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): + torch.cuda.set_device(device_id) + model = model.cuda(device_id) + model = DDP(model, device_ids=[device_id]) + else: + raise 
NotImplementedError("Only DistributedDataParallel is supported.") + + # define loss function (criterion) and optimizer + criterion = nn.CrossEntropyLoss().cuda(device_id) + + # optimize only the linear classifier + parameters = list(filter(lambda p: p.requires_grad, model.parameters())) + assert len(parameters) == 2 # fc.weight, fc.bias + + optimizer = torch.optim.SGD( + parameters, init_lr, momentum=args.momentum, weight_decay=args.weight_decay + ) + if args.lars: + print("Use LARS optimizer.") + from apex.parallel.LARC import LARC + + optimizer = LARC(optimizer=optimizer, trust_coefficient=0.001, clip=False) + + cudnn.benchmark = True + + # Data loading code + train_dir = os.path.join(args.data_dir, "train") + val_dir = os.path.join(args.data_dir, "val") + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + train_dataset = datasets.ImageFolder( + train_dir, + transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ] + ), + ) + + if dist_utils.is_dist_avail_and_initialized() and args.distributed_mode: + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, + seed=args.seed, + ) + else: + train_sampler = None + + init_fn = partial( + worker_init_fn, + num_workers=args.num_workers, + rank=dist_utils.get_rank(), + seed=args.seed, + ) + train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.batch_size, + shuffle=(train_sampler is None), + sampler=train_sampler, + num_workers=args.num_workers, + worker_init_fn=init_fn, + pin_memory=True, # TODO(arashaf): this was set to false in training script. 
+ ) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder( + val_dir, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ] + ), + ), + batch_size=256, + shuffle=False, + num_workers=args.num_workers, + pin_memory=True, + ) + + # if args.evaluate: + # validate(val_loader, model, criterion, device_id, args) + # return + + for epoch in range(args.epochs): + print(f"Starting training epoch: {epoch}") + if dist_utils.is_dist_avail_and_initialized(): + train_sampler.set_epoch(epoch) + adjust_learning_rate(optimizer, init_lr, epoch, args) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, device_id, args) + + # evaluate on validation set + acc1 = validate(val_loader, model, criterion, device_id, args) + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if args.checkpoint_dir and dist_utils.get_rank() == 0: + os.makedirs(args.checkpoint_dir, exist_ok=True) + checkpoint_name = "eval_checkpoint_{:04d}.pth.tar".format(epoch) + checkpoint_file = os.path.join(args.checkpoint_dir, checkpoint_name) + save_checkpoint( + { + "epoch": epoch, + "arch": args.arch, + "state_dict": model.state_dict(), + "best_acc1": best_acc1, + "optimizer": optimizer.state_dict(), + }, + is_best, + checkpoint_file, + ) + if epoch == 0: + sanity_check(model.state_dict(), args.pretrained_checkpoint) + + +def train(train_loader, model, criterion, optimizer, epoch, device_id, args): + batch_time = AverageMeter("Time", ":6.3f") + data_time = AverageMeter("Data", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch), + ) + + """ + Switch to eval mode: + Under the protocol of linear classification on frozen features/models, + 
it is not legitimate to change any part of the pre-trained model. + BatchNorm in train mode may revise running mean/std (even if it receives + no gradient), which are part of the model parameters too. + """ + model.eval() + + end = time.time() + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + images = images.cuda(device_id, non_blocking=True) + target = target.cuda(device_id, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + +def validate(val_loader, model, criterion, device_id, args): + batch_time = AverageMeter("Time", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") + progress = ProgressMeter( + len(val_loader), [batch_time, losses, top1, top5], prefix="Test: " + ) + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(val_loader): + images = images.cuda(device_id, non_blocking=True) + target = target.cuda(device_id, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % 
args.print_freq == 0: + progress.display(i) + + # TODO: this should also be done with the ProgressMeter + print( + " * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}".format(top1=top1, top5=top5) + ) + + return top1.avg + + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + print(f"Saving checkpoint at: {filename}") + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, "model_best.pth.tar") + + +def sanity_check(state_dict, pretrained_weights): + """ + Linear classifier should not change any weights other than the linear layer. + This sanity check asserts nothing wrong happens (e.g., BN stats updated). + """ + print(f"Loading {pretrained_weights} for sanity check") + checkpoint = torch.load(pretrained_weights, map_location="cpu") + state_dict_pre = checkpoint["state_dict"] + + for k in list(state_dict.keys()): + # only ignore fc layer + if "fc.weight" in k or "fc.bias" in k: + continue + + # name in pretrained model + k_pre = ( + "module.encoder." + k[len("module.") :] + if k.startswith("module.") + else "module.encoder." 
+ k + ) + + assert ( + state_dict[k].cpu() == state_dict_pre[k_pre] + ).all(), "{} is changed in linear classifier training.".format(k) + + print("Sanity check passed.") + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=":f"): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print("\t".join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = "{:" + str(num_digits) + "d}" + return "[" + fmt + "/" + fmt.format(num_batches) + "]" + + +def adjust_learning_rate(optimizer, init_lr, epoch, args): + """Decay the learning rate based on schedule""" + cur_lr = init_lr * 0.5 * (1.0 + math.cos(math.pi * epoch / args.epochs)) + for param_group in optimizer.param_groups: + param_group["lr"] = cur_lr + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ 
== "__main__": + main() diff --git a/simsiam/main_lincls.py b/simsiam/main_lincls.py deleted file mode 100755 index ed4e208..0000000 --- a/simsiam/main_lincls.py +++ /dev/null @@ -1,516 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import builtins -import math -import os -import random -import shutil -import time -import warnings - -import torch -import torch.nn as nn -import torch.nn.parallel -import torch.backends.cudnn as cudnn -import torch.distributed as dist -import torch.optim -import torch.multiprocessing as mp -import torch.utils.data -import torch.utils.data.distributed -import torchvision.transforms as transforms -import torchvision.datasets as datasets -import torchvision.models as models - -model_names = sorted(name for name in models.__dict__ - if name.islower() and not name.startswith("__") - and callable(models.__dict__[name])) - -parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') -parser.add_argument('data', metavar='DIR', - help='path to dataset') -parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50', - choices=model_names, - help='model architecture: ' + - ' | '.join(model_names) + - ' (default: resnet50)') -parser.add_argument('-j', '--workers', default=32, type=int, metavar='N', - help='number of data loading workers (default: 32)') -parser.add_argument('--epochs', default=90, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--start-epoch', default=0, type=int, metavar='N', - help='manual epoch number (useful on restarts)') -parser.add_argument('-b', '--batch-size', default=4096, type=int, - metavar='N', - help='mini-batch size (default: 4096), this is the total ' - 'batch size of all GPUs on the current node when ' - 'using Data Parallel or Distributed Data 
Parallel') -parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, - metavar='LR', help='initial (base) learning rate', dest='lr') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum') -parser.add_argument('--wd', '--weight-decay', default=0., type=float, - metavar='W', help='weight decay (default: 0.)', - dest='weight_decay') -parser.add_argument('-p', '--print-freq', default=10, type=int, - metavar='N', help='print frequency (default: 10)') -parser.add_argument('--resume', default='', type=str, metavar='PATH', - help='path to latest checkpoint (default: none)') -parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', - help='evaluate model on validation set') -parser.add_argument('--world-size', default=-1, type=int, - help='number of nodes for distributed training') -parser.add_argument('--rank', default=-1, type=int, - help='node rank for distributed training') -parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, - help='url used to set up distributed training') -parser.add_argument('--dist-backend', default='nccl', type=str, - help='distributed backend') -parser.add_argument('--seed', default=None, type=int, - help='seed for initializing training. ') -parser.add_argument('--gpu', default=None, type=int, - help='GPU id to use.') -parser.add_argument('--multiprocessing-distributed', action='store_true', - help='Use multi-processing distributed training to launch ' - 'N processes per node, which has N GPUs. 
This is the ' - 'fastest way to use PyTorch for either single node or ' - 'multi node data parallel training') - -# additional configs: -parser.add_argument('--pretrained', default='', type=str, - help='path to simsiam pretrained checkpoint') -parser.add_argument('--lars', action='store_true', - help='Use LARS') - -best_acc1 = 0 - - -def main(): - args = parser.parse_args() - - if args.seed is not None: - random.seed(args.seed) - torch.manual_seed(args.seed) - cudnn.deterministic = True - warnings.warn('You have chosen to seed training. ' - 'This will turn on the CUDNN deterministic setting, ' - 'which can slow down your training considerably! ' - 'You may see unexpected behavior when restarting ' - 'from checkpoints.') - - if args.gpu is not None: - warnings.warn('You have chosen a specific GPU. This will completely ' - 'disable data parallelism.') - - if args.dist_url == "env://" and args.world_size == -1: - args.world_size = int(os.environ["WORLD_SIZE"]) - - args.distributed = args.world_size > 1 or args.multiprocessing_distributed - - ngpus_per_node = torch.cuda.device_count() - if args.multiprocessing_distributed: - # Since we have ngpus_per_node processes per node, the total world_size - # needs to be adjusted accordingly - args.world_size = ngpus_per_node * args.world_size - # Use torch.multiprocessing.spawn to launch distributed processes: the - # main_worker process function - mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) - else: - # Simply call main_worker function - main_worker(args.gpu, ngpus_per_node, args) - - -def main_worker(gpu, ngpus_per_node, args): - global best_acc1 - args.gpu = gpu - - # suppress printing if not master - if args.multiprocessing_distributed and args.gpu != 0: - def print_pass(*args): - pass - builtins.print = print_pass - - if args.gpu is not None: - print("Use GPU: {} for training".format(args.gpu)) - - if args.distributed: - if args.dist_url == "env://" and args.rank == -1: - args.rank = 
int(os.environ["RANK"]) - if args.multiprocessing_distributed: - # For multiprocessing distributed training, rank needs to be the - # global rank among all the processes - args.rank = args.rank * ngpus_per_node + gpu - dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - torch.distributed.barrier() - # create model - print("=> creating model '{}'".format(args.arch)) - model = models.__dict__[args.arch]() - - # freeze all layers but the last fc - for name, param in model.named_parameters(): - if name not in ['fc.weight', 'fc.bias']: - param.requires_grad = False - # init the fc layer - model.fc.weight.data.normal_(mean=0.0, std=0.01) - model.fc.bias.data.zero_() - - # load from pre-trained, before DistributedDataParallel constructor - if args.pretrained: - if os.path.isfile(args.pretrained): - print("=> loading checkpoint '{}'".format(args.pretrained)) - checkpoint = torch.load(args.pretrained, map_location="cpu") - - # rename moco pre-trained keys - state_dict = checkpoint['state_dict'] - for k in list(state_dict.keys()): - # retain only encoder up to before the embedding layer - if k.startswith('module.encoder') and not k.startswith('module.encoder.fc'): - # remove prefix - state_dict[k[len("module.encoder."):]] = state_dict[k] - # delete renamed or unused k - del state_dict[k] - - args.start_epoch = 0 - msg = model.load_state_dict(state_dict, strict=False) - assert set(msg.missing_keys) == {"fc.weight", "fc.bias"} - - print("=> loaded pre-trained model '{}'".format(args.pretrained)) - else: - print("=> no checkpoint found at '{}'".format(args.pretrained)) - - # infer learning rate before changing batch size - init_lr = args.lr * args.batch_size / 256 - - if args.distributed: - # For multiprocessing distributed, DistributedDataParallel constructor - # should always set the single device scope, otherwise, - # DistributedDataParallel will use all available devices. 
- if args.gpu is not None: - torch.cuda.set_device(args.gpu) - model.cuda(args.gpu) - # When using a single GPU per process and per - # DistributedDataParallel, we need to divide the batch size - # ourselves based on the total number of GPUs we have - args.batch_size = int(args.batch_size / ngpus_per_node) - args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) - else: - model.cuda() - # DistributedDataParallel will divide and allocate batch_size to all - # available GPUs if device_ids are not set - model = torch.nn.parallel.DistributedDataParallel(model) - elif args.gpu is not None: - torch.cuda.set_device(args.gpu) - model = model.cuda(args.gpu) - else: - # DataParallel will divide and allocate batch_size to all available GPUs - if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): - model.features = torch.nn.DataParallel(model.features) - model.cuda() - else: - model = torch.nn.DataParallel(model).cuda() - - # define loss function (criterion) and optimizer - criterion = nn.CrossEntropyLoss().cuda(args.gpu) - - # optimize only the linear classifier - parameters = list(filter(lambda p: p.requires_grad, model.parameters())) - assert len(parameters) == 2 # fc.weight, fc.bias - - optimizer = torch.optim.SGD(parameters, init_lr, - momentum=args.momentum, - weight_decay=args.weight_decay) - if args.lars: - print("=> use LARS optimizer.") - from apex.parallel.LARC import LARC - optimizer = LARC(optimizer=optimizer, trust_coefficient=.001, clip=False) - - # optionally resume from a checkpoint - if args.resume: - if os.path.isfile(args.resume): - print("=> loading checkpoint '{}'".format(args.resume)) - if args.gpu is None: - checkpoint = torch.load(args.resume) - else: - # Map model to be loaded to specified single gpu. 
- loc = 'cuda:{}'.format(args.gpu) - checkpoint = torch.load(args.resume, map_location=loc) - args.start_epoch = checkpoint['epoch'] - best_acc1 = checkpoint['best_acc1'] - if args.gpu is not None: - # best_acc1 may be from a checkpoint from a different GPU - best_acc1 = best_acc1.to(args.gpu) - model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - print("=> loaded checkpoint '{}' (epoch {})" - .format(args.resume, checkpoint['epoch'])) - else: - print("=> no checkpoint found at '{}'".format(args.resume)) - - cudnn.benchmark = True - - # Data loading code - traindir = os.path.join(args.data, 'train') - valdir = os.path.join(args.data, 'val') - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - - train_dataset = datasets.ImageFolder( - traindir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - if args.distributed: - train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) - else: - train_sampler = None - - train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), - num_workers=args.workers, pin_memory=True, sampler=train_sampler) - - val_loader = torch.utils.data.DataLoader( - datasets.ImageFolder(valdir, transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])), - batch_size=256, shuffle=False, - num_workers=args.workers, pin_memory=True) - - if args.evaluate: - validate(val_loader, model, criterion, args) - return - - for epoch in range(args.start_epoch, args.epochs): - if args.distributed: - train_sampler.set_epoch(epoch) - adjust_learning_rate(optimizer, init_lr, epoch, args) - - # train for one epoch - train(train_loader, model, criterion, optimizer, epoch, args) - - # evaluate on validation set - acc1 = validate(val_loader, model, 
criterion, args) - - # remember best acc@1 and save checkpoint - is_best = acc1 > best_acc1 - best_acc1 = max(acc1, best_acc1) - - if not args.multiprocessing_distributed or (args.multiprocessing_distributed - and args.rank % ngpus_per_node == 0): - save_checkpoint({ - 'epoch': epoch + 1, - 'arch': args.arch, - 'state_dict': model.state_dict(), - 'best_acc1': best_acc1, - 'optimizer' : optimizer.state_dict(), - }, is_best) - if epoch == args.start_epoch: - sanity_check(model.state_dict(), args.pretrained) - - -def train(train_loader, model, criterion, optimizer, epoch, args): - batch_time = AverageMeter('Time', ':6.3f') - data_time = AverageMeter('Data', ':6.3f') - losses = AverageMeter('Loss', ':.4e') - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - progress = ProgressMeter( - len(train_loader), - [batch_time, data_time, losses, top1, top5], - prefix="Epoch: [{}]".format(epoch)) - - """ - Switch to eval mode: - Under the protocol of linear classification on frozen features/models, - it is not legitimate to change any part of the pre-trained model. - BatchNorm in train mode may revise running mean/std (even if it receives - no gradient), which are part of the model parameters too. 
- """ - model.eval() - - end = time.time() - for i, (images, target) in enumerate(train_loader): - # measure data loading time - data_time.update(time.time() - end) - - if args.gpu is not None: - images = images.cuda(args.gpu, non_blocking=True) - target = target.cuda(args.gpu, non_blocking=True) - - # compute output - output = model(images) - loss = criterion(output, target) - - # measure accuracy and record loss - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - losses.update(loss.item(), images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - - # compute gradient and do SGD step - optimizer.zero_grad() - loss.backward() - optimizer.step() - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % args.print_freq == 0: - progress.display(i) - - -def validate(val_loader, model, criterion, args): - batch_time = AverageMeter('Time', ':6.3f') - losses = AverageMeter('Loss', ':.4e') - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - progress = ProgressMeter( - len(val_loader), - [batch_time, losses, top1, top5], - prefix='Test: ') - - # switch to evaluate mode - model.eval() - - with torch.no_grad(): - end = time.time() - for i, (images, target) in enumerate(val_loader): - if args.gpu is not None: - images = images.cuda(args.gpu, non_blocking=True) - target = target.cuda(args.gpu, non_blocking=True) - - # compute output - output = model(images) - loss = criterion(output, target) - - # measure accuracy and record loss - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - losses.update(loss.item(), images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % args.print_freq == 0: - progress.display(i) - - # TODO: this should also be done with the ProgressMeter - print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' - 
.format(top1=top1, top5=top5)) - - return top1.avg - - -def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): - torch.save(state, filename) - if is_best: - shutil.copyfile(filename, 'model_best.pth.tar') - - -def sanity_check(state_dict, pretrained_weights): - """ - Linear classifier should not change any weights other than the linear layer. - This sanity check asserts nothing wrong happens (e.g., BN stats updated). - """ - print("=> loading '{}' for sanity check".format(pretrained_weights)) - checkpoint = torch.load(pretrained_weights, map_location="cpu") - state_dict_pre = checkpoint['state_dict'] - - for k in list(state_dict.keys()): - # only ignore fc layer - if 'fc.weight' in k or 'fc.bias' in k: - continue - - # name in pretrained model - k_pre = 'module.encoder.' + k[len('module.'):] \ - if k.startswith('module.') else 'module.encoder.' + k - - assert ((state_dict[k].cpu() == state_dict_pre[k_pre]).all()), \ - '{} is changed in linear classifier training.'.format(k) - - print("=> sanity check passed.") - - -class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - -class ProgressMeter(object): - def __init__(self, num_batches, meters, prefix=""): - self.batch_fmtstr = self._get_batch_fmtstr(num_batches) - self.meters = meters - self.prefix = prefix - - def display(self, batch): - entries = [self.prefix + self.batch_fmtstr.format(batch)] - entries += [str(meter) for meter in self.meters] - print('\t'.join(entries)) - - def _get_batch_fmtstr(self, num_batches): - num_digits = len(str(num_batches // 
1)) - fmt = '{:' + str(num_digits) + 'd}' - return '[' + fmt + '/' + fmt.format(num_batches) + ']' - - -def adjust_learning_rate(optimizer, init_lr, epoch, args): - """Decay the learning rate based on schedule""" - cur_lr = init_lr * 0.5 * (1. + math.cos(math.pi * epoch / args.epochs)) - for param_group in optimizer.param_groups: - param_group['lr'] = cur_lr - - -def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - -if __name__ == '__main__': - main() diff --git a/simsiam/main_simsiam.py b/simsiam/main_simsiam.py index 4055b7e..e486c4b 100755 --- a/simsiam/main_simsiam.py +++ b/simsiam/main_simsiam.py @@ -304,7 +304,7 @@ def main(): # Checkpointing. 
if dist_utils.get_rank() == 0: - checkpoint_name = f"checkpoint_{epoch}.pth.tar" + checkpoint_name = "checkpoint_{:04d}.pth.tar".format(epoch) checkpoint_file = os.path.join(args.checkpoint_dir, checkpoint_name) save_checkpoint( { diff --git a/train_simsiam.slrm b/train_simsiam.slrm index 4d5db7f..4c8f46d 100644 --- a/train_simsiam.slrm +++ b/train_simsiam.slrm @@ -1,6 +1,6 @@ #!/bin/bash -#SBATCH --job-name="simsiam_icgan" +#SBATCH --job-name="simsiam_baseline" #SBATCH --partition=a100 #SBATCH --qos=a100_arashaf #SBATCH --time=72:00:00 @@ -32,9 +32,14 @@ srun python simsiam/main_simsiam.py \ --distributed_mode \ --batch-size=256 \ --epochs=100 \ ---use_synthetic_data \ ---synthetic_data_dir="/projects/imagenet_synthetic/synthetic_icgan" \ ---synthetic_index_min=0 \ ---synthetic_index_max=4 \ ---generative_augmentation_prob=0.5 ---experiment="simsiam_icgan" \ +--experiment="simsiam_baseline" \ +--resume_from_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_2024-02-27-16-48/checkpoint_22.pth.tar" + + + + +# --use_synthetic_data \ +# --synthetic_data_dir="/projects/imagenet_synthetic/synthetic_icgan" \ +# --synthetic_index_min=0 \ +# --synthetic_index_max=4 \ +# --generative_augmentation_prob=0.5 \ From c724279ec917fd7f38396f4c78fb23ea4539f8ad Mon Sep 17 00:00:00 2001 From: Arash Afkanpour Date: Wed, 28 Feb 2024 21:05:26 -0500 Subject: [PATCH 04/38] Add slurm files for eval runs. 
--- eval_simsiam_baseline_30.slrm | 35 +++++++++++++++++++ ...simsiam.slrm => eval_simsiam_icgan_30.slrm | 9 ++--- eval_simsiam_stablediff_30.slrm | 35 +++++++++++++++++++ 3 files changed, 75 insertions(+), 4 deletions(-) create mode 100644 eval_simsiam_baseline_30.slrm rename eval_simsiam.slrm => eval_simsiam_icgan_30.slrm (82%) create mode 100644 eval_simsiam_stablediff_30.slrm diff --git a/eval_simsiam_baseline_30.slrm b/eval_simsiam_baseline_30.slrm new file mode 100644 index 0000000..0e69c30 --- /dev/null +++ b/eval_simsiam_baseline_30.slrm @@ -0,0 +1,35 @@ +#!/bin/bash + +#SBATCH --job-name="simsiam_eval" +#SBATCH --partition=a40 +#SBATCH --time=12:00:00 +#SBATCH --qos=m +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=4G +#SBATCH --output=slurm-%j.out + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export PYTHONPATH="." 
+nvidia-smi + +# “srun” executes the script times +srun python simsiam/linear_eval.py \ +--data_dir="/scratch/ssd004/datasets/imagenet256" \ +--arch="resnet50" \ +--distributed_mode \ +--batch-size=256 \ +--epochs=100 \ +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_2024-02-28-15-16/checkpoint_0030.pth.tar" diff --git a/eval_simsiam.slrm b/eval_simsiam_icgan_30.slrm similarity index 82% rename from eval_simsiam.slrm rename to eval_simsiam_icgan_30.slrm index 5479793..8689f9e 100644 --- a/eval_simsiam.slrm +++ b/eval_simsiam_icgan_30.slrm @@ -1,8 +1,9 @@ #!/bin/bash -#SBATCH --job-name="simsiam_eval_baseline" -#SBATCH --partition=t4v2 -#SBATCH --time=08:00:00 +#SBATCH --job-name="simsiam_eval" +#SBATCH --partition=a40 +#SBATCH --time=12:00:00 +#SBATCH --qos=m #SBATCH --nodes=1 #SBATCH --gres=gpu:4 #SBATCH --ntasks-per-node=4 @@ -31,4 +32,4 @@ srun python simsiam/linear_eval.py \ --distributed_mode \ --batch-size=256 \ --epochs=100 \ ---pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_2024-02-27-16-48/checkpoint_22.pth.tar" +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_2024-02-28-15-14/checkpoint_0030.pth.tar" diff --git a/eval_simsiam_stablediff_30.slrm b/eval_simsiam_stablediff_30.slrm new file mode 100644 index 0000000..5961bc2 --- /dev/null +++ b/eval_simsiam_stablediff_30.slrm @@ -0,0 +1,35 @@ +#!/bin/bash + +#SBATCH --job-name="simsiam_eval" +#SBATCH --partition=a40 +#SBATCH --time=12:00:00 +#SBATCH --qos=m +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=4G +#SBATCH --output=slurm-%j.out + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export 
PYTHONPATH="." +nvidia-smi + +# “srun” executes the script times +srun python simsiam/linear_eval.py \ +--data_dir="/scratch/ssd004/datasets/imagenet256" \ +--arch="resnet50" \ +--distributed_mode \ +--batch-size=256 \ +--epochs=100 \ +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_2024-02-28-15-11/checkpoint_0030.pth.tar" From 19dbce512b95d3b353f0a464a6de81eb87df1c23 Mon Sep 17 00:00:00 2001 From: Arash Afkanpour Date: Thu, 29 Feb 2024 10:20:29 -0500 Subject: [PATCH 05/38] Added tqdm. --- simsiam/linear_eval.py | 92 +++++------------------------------------- 1 file changed, 9 insertions(+), 83 deletions(-) diff --git a/simsiam/linear_eval.py b/simsiam/linear_eval.py index c4cf05a..9ff8fbf 100755 --- a/simsiam/linear_eval.py +++ b/simsiam/linear_eval.py @@ -9,7 +9,6 @@ import os import random import shutil -import time from functools import partial import torch @@ -21,6 +20,7 @@ from torch.backends import cudnn from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 from torchvision import datasets, models, transforms +from tqdm import tqdm from SimCLR import distributed as dist_utils @@ -86,21 +86,6 @@ help="weight decay (default: 0.)", dest="weight_decay", ) -parser.add_argument( - "-p", - "--print-freq", - default=10, - type=int, - metavar="N", - help="print frequency (default: 10)", -) -parser.add_argument( - "-e", - "--evaluate", - dest="evaluate", - action="store_true", - help="Whether to only evaluate model on validation set (no downstream training).", -) parser.add_argument( "--distributed_mode", action="store_true", @@ -239,7 +224,7 @@ def main(): transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, - ] + ], ), ) @@ -276,7 +261,7 @@ def main(): transforms.CenterCrop(224), transforms.ToTensor(), normalize, - ] + ], ), ), batch_size=256, @@ -285,11 +270,7 @@ def main(): pin_memory=True, ) - # if args.evaluate: - # validate(val_loader, model, criterion, device_id, args) - # return - 
- for epoch in range(args.epochs): + for epoch in tqdm(range(args.epochs)): print(f"Starting training epoch: {epoch}") if dist_utils.is_dist_avail_and_initialized(): train_sampler.set_epoch(epoch) @@ -325,17 +306,6 @@ def main(): def train(train_loader, model, criterion, optimizer, epoch, device_id, args): - batch_time = AverageMeter("Time", ":6.3f") - data_time = AverageMeter("Data", ":6.3f") - losses = AverageMeter("Loss", ":.4e") - top1 = AverageMeter("Acc@1", ":6.2f") - top5 = AverageMeter("Acc@5", ":6.2f") - progress = ProgressMeter( - len(train_loader), - [batch_time, data_time, losses, top1, top5], - prefix="Epoch: [{}]".format(epoch), - ) - """ Switch to eval mode: Under the protocol of linear classification on frozen features/models, @@ -345,11 +315,7 @@ def train(train_loader, model, criterion, optimizer, epoch, device_id, args): """ model.eval() - end = time.time() - for i, (images, target) in enumerate(train_loader): - # measure data loading time - data_time.update(time.time() - end) - + for images, target in tqdm(train_loader): images = images.cuda(device_id, non_blocking=True) target = target.cuda(device_id, non_blocking=True) @@ -359,38 +325,22 @@ def train(train_loader, model, criterion, optimizer, epoch, device_id, args): # measure accuracy and record loss acc1, acc5 = accuracy(output, target, topk=(1, 5)) - losses.update(loss.item(), images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) # compute gradient and do SGD step optimizer.zero_grad() loss.backward() optimizer.step() - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % args.print_freq == 0: - progress.display(i) - def validate(val_loader, model, criterion, device_id, args): - batch_time = AverageMeter("Time", ":6.3f") - losses = AverageMeter("Loss", ":.4e") top1 = AverageMeter("Acc@1", ":6.2f") top5 = AverageMeter("Acc@5", ":6.2f") - progress = ProgressMeter( - len(val_loader), [batch_time, losses, top1, 
top5], prefix="Test: " - ) # switch to evaluate mode model.eval() with torch.no_grad(): - end = time.time() - for i, (images, target) in enumerate(val_loader): + for _, (images, target) in enumerate(val_loader): images = images.cuda(device_id, non_blocking=True) target = target.cuda(device_id, non_blocking=True) @@ -400,20 +350,13 @@ def validate(val_loader, model, criterion, device_id, args): # measure accuracy and record loss acc1, acc5 = accuracy(output, target, topk=(1, 5)) - losses.update(loss.item(), images.size(0)) top1.update(acc1[0], images.size(0)) top5.update(acc5[0], images.size(0)) - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % args.print_freq == 0: - progress.display(i) - - # TODO: this should also be done with the ProgressMeter print( - " * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}".format(top1=top1, top5=top5) + "Validation Accuracy@1 {top1.avg:.3f}, Accuracy@5 {top5.avg:.3f}".format( + top1=top1, top5=top5 + ) ) return top1.avg @@ -479,23 +422,6 @@ def __str__(self): return fmtstr.format(**self.__dict__) -class ProgressMeter(object): - def __init__(self, num_batches, meters, prefix=""): - self.batch_fmtstr = self._get_batch_fmtstr(num_batches) - self.meters = meters - self.prefix = prefix - - def display(self, batch): - entries = [self.prefix + self.batch_fmtstr.format(batch)] - entries += [str(meter) for meter in self.meters] - print("\t".join(entries)) - - def _get_batch_fmtstr(self, num_batches): - num_digits = len(str(num_batches // 1)) - fmt = "{:" + str(num_digits) + "d}" - return "[" + fmt + "/" + fmt.format(num_batches) + "]" - - def adjust_learning_rate(optimizer, init_lr, epoch, args): """Decay the learning rate based on schedule""" cur_lr = init_lr * 0.5 * (1.0 + math.cos(math.pi * epoch / args.epochs)) From a2dfc9f30eac92766651420989fd82671506e06b Mon Sep 17 00:00:00 2001 From: Arash Afkanpour Date: Thu, 29 Feb 2024 14:53:32 -0500 Subject: [PATCH 06/38] Modified eval scripts to use epoch 
50 checkpoint. --- ...aseline_30.slrm => eval_simsiam_baseline_50.slrm | 12 ++++++------ ...siam_icgan_30.slrm => eval_simsiam_icgan_50.slrm | 12 ++++++------ ...ediff_30.slrm => eval_simsiam_stablediff_50.slrm | 13 +++++++------ simsiam/linear_eval.py | 2 +- 4 files changed, 20 insertions(+), 19 deletions(-) rename eval_simsiam_baseline_30.slrm => eval_simsiam_baseline_50.slrm (80%) rename eval_simsiam_icgan_30.slrm => eval_simsiam_icgan_50.slrm (80%) rename eval_simsiam_stablediff_30.slrm => eval_simsiam_stablediff_50.slrm (80%) diff --git a/eval_simsiam_baseline_30.slrm b/eval_simsiam_baseline_50.slrm similarity index 80% rename from eval_simsiam_baseline_30.slrm rename to eval_simsiam_baseline_50.slrm index 0e69c30..11f5c5e 100644 --- a/eval_simsiam_baseline_30.slrm +++ b/eval_simsiam_baseline_50.slrm @@ -2,13 +2,13 @@ #SBATCH --job-name="simsiam_eval" #SBATCH --partition=a40 -#SBATCH --time=12:00:00 -#SBATCH --qos=m -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 +#SBATCH --account=deadline +#SBATCH --qos=deadline +#SBATCH --nodes=2 +#SBATCH --gres=gpu:8 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 -#SBATCH --mem-per-cpu=4G +#SBATCH --mem-per-cpu=8G #SBATCH --output=slurm-%j.out PY_ARGS=${@:1} @@ -32,4 +32,4 @@ srun python simsiam/linear_eval.py \ --distributed_mode \ --batch-size=256 \ --epochs=100 \ ---pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_2024-02-28-15-16/checkpoint_0030.pth.tar" +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_2024-02-29-09-27/checkpoint_0050.pth.tar" diff --git a/eval_simsiam_icgan_30.slrm b/eval_simsiam_icgan_50.slrm similarity index 80% rename from eval_simsiam_icgan_30.slrm rename to eval_simsiam_icgan_50.slrm index 8689f9e..de9ce1a 100644 --- a/eval_simsiam_icgan_30.slrm +++ b/eval_simsiam_icgan_50.slrm @@ -2,13 +2,13 @@ #SBATCH --job-name="simsiam_eval" #SBATCH --partition=a40 -#SBATCH --time=12:00:00 -#SBATCH --qos=m -#SBATCH --nodes=1 
-#SBATCH --gres=gpu:4 +#SBATCH --account=deadline +#SBATCH --qos=deadline +#SBATCH --nodes=2 +#SBATCH --gres=gpu:8 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 -#SBATCH --mem-per-cpu=4G +#SBATCH --mem-per-cpu=8G #SBATCH --output=slurm-%j.out PY_ARGS=${@:1} @@ -32,4 +32,4 @@ srun python simsiam/linear_eval.py \ --distributed_mode \ --batch-size=256 \ --epochs=100 \ ---pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_2024-02-28-15-14/checkpoint_0030.pth.tar" +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_2024-02-29-09-30/checkpoint_0050.pth.tar" diff --git a/eval_simsiam_stablediff_30.slrm b/eval_simsiam_stablediff_50.slrm similarity index 80% rename from eval_simsiam_stablediff_30.slrm rename to eval_simsiam_stablediff_50.slrm index 5961bc2..6b46de6 100644 --- a/eval_simsiam_stablediff_30.slrm +++ b/eval_simsiam_stablediff_50.slrm @@ -2,15 +2,16 @@ #SBATCH --job-name="simsiam_eval" #SBATCH --partition=a40 -#SBATCH --time=12:00:00 -#SBATCH --qos=m -#SBATCH --nodes=1 -#SBATCH --gres=gpu:4 +#SBATCH --account=deadline +#SBATCH --qos=deadline +#SBATCH --nodes=2 +#SBATCH --gres=gpu:8 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 -#SBATCH --mem-per-cpu=4G +#SBATCH --mem-per-cpu=8G #SBATCH --output=slurm-%j.out + PY_ARGS=${@:1} # load virtual environment @@ -32,4 +33,4 @@ srun python simsiam/linear_eval.py \ --distributed_mode \ --batch-size=256 \ --epochs=100 \ ---pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_2024-02-28-15-11/checkpoint_0030.pth.tar" +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_2024-02-29-09-33/checkpoint_0050.pth.tar" diff --git a/simsiam/linear_eval.py b/simsiam/linear_eval.py index 9ff8fbf..a088d2e 100755 --- a/simsiam/linear_eval.py +++ b/simsiam/linear_eval.py @@ -340,7 +340,7 @@ def validate(val_loader, model, criterion, device_id, args): model.eval() with 
torch.no_grad(): - for _, (images, target) in enumerate(val_loader): + for images, target in tqdm(val_loader): images = images.cuda(device_id, non_blocking=True) target = target.cuda(device_id, non_blocking=True) From 9c17abb864a0c4906e0dbebd8ed54f90456136fe Mon Sep 17 00:00:00 2001 From: fforghani Date: Fri, 1 Mar 2024 16:05:32 -0500 Subject: [PATCH 07/38] Add Lars to linear evaluation --- simsiam/LARC.py | 107 +++++++++++++++++++++++++++++++++++++++++ simsiam/linear_eval.py | 3 +- 2 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 simsiam/LARC.py diff --git a/simsiam/LARC.py b/simsiam/LARC.py new file mode 100644 index 0000000..709f2e0 --- /dev/null +++ b/simsiam/LARC.py @@ -0,0 +1,107 @@ +import torch +from torch import nn +from torch.nn.parameter import Parameter + +class LARC(object): + """ + :class:`LARC` is a pytorch implementation of both the scaling and clipping variants of LARC, + in which the ratio between gradient and parameter magnitudes is used to calculate an adaptive + local learning rate for each individual parameter. The algorithm is designed to improve + convergence of large batch training. + + See https://arxiv.org/abs/1708.03888 for calculation of the local learning rate. + + In practice it modifies the gradients of parameters as a proxy for modifying the learning rate + of the parameters. This design allows it to be used as a wrapper around any torch.optim Optimizer. + + ``` + model = ... + optim = torch.optim.Adam(model.parameters(), lr=...) + optim = LARC(optim) + ``` + + It can even be used in conjunction with apex.fp16_utils.FP16_optimizer. + + ``` + model = ... + optim = torch.optim.Adam(model.parameters(), lr=...) + optim = LARC(optim) + optim = apex.fp16_utils.FP16_Optimizer(optim) + ``` + + Args: + optimizer: Pytorch optimizer to wrap and modify learning rate for. + trust_coefficient: Trust coefficient for calculating the lr. 
See https://arxiv.org/abs/1708.03888 + clip: Decides between clipping or scaling mode of LARC. If `clip=True` the learning rate is set to `min(optimizer_lr, local_lr)` for each parameter. If `clip=False` the learning rate is set to `local_lr*optimizer_lr`. + eps: epsilon kludge to help with numerical stability while calculating adaptive_lr + """ + + def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1e-8): + self.optim = optimizer + self.trust_coefficient = trust_coefficient + self.eps = eps + self.clip = clip + + def __getstate__(self): + return self.optim.__getstate__() + + def __setstate__(self, state): + self.optim.__setstate__(state) + + @property + def state(self): + return self.optim.state + + def __repr__(self): + return self.optim.__repr__() + + @property + def param_groups(self): + return self.optim.param_groups + + @param_groups.setter + def param_groups(self, value): + self.optim.param_groups = value + + def state_dict(self): + return self.optim.state_dict() + + def load_state_dict(self, state_dict): + self.optim.load_state_dict(state_dict) + + def zero_grad(self): + self.optim.zero_grad() + + def add_param_group(self, param_group): + self.optim.add_param_group( param_group) + + def step(self): + with torch.no_grad(): + weight_decays = [] + for group in self.optim.param_groups: + # absorb weight decay control from optimizer + weight_decay = group['weight_decay'] if 'weight_decay' in group else 0 + weight_decays.append(weight_decay) + group['weight_decay'] = 0 + for p in group['params']: + if p.grad is None: + continue + param_norm = torch.norm(p.data) + grad_norm = torch.norm(p.grad.data) + + if param_norm != 0 and grad_norm != 0: + # calculate adaptive lr + weight decay + adaptive_lr = self.trust_coefficient * (param_norm) / (grad_norm + param_norm * weight_decay + self.eps) + + # clip learning rate for LARC + if self.clip: + # calculation of adaptive_lr so that when multiplied by lr it equals `min(adaptive_lr, lr)` + adaptive_lr = 
min(adaptive_lr/group['lr'], 1) + + p.grad.data += weight_decay * p.data + p.grad.data *= adaptive_lr + + self.optim.step() + # return weight decay control to optimizer + for i, group in enumerate(self.optim.param_groups): + group['weight_decay'] = weight_decays[i] \ No newline at end of file diff --git a/simsiam/linear_eval.py b/simsiam/linear_eval.py index a088d2e..bce384a 100755 --- a/simsiam/linear_eval.py +++ b/simsiam/linear_eval.py @@ -203,7 +203,8 @@ def main(): ) if args.lars: print("Use LARS optimizer.") - from apex.parallel.LARC import LARC + # from apex.parallel.LARC import LARC + from LARC import LARC optimizer = LARC(optimizer=optimizer, trust_coefficient=0.001, clip=False) From eb8964b9981b27b2925c32778372dfa86313801a Mon Sep 17 00:00:00 2001 From: Arash Afkanpour Date: Fri, 1 Mar 2024 16:09:10 -0500 Subject: [PATCH 08/38] Added a linear eval script that prints out the original logs --- simsiam/linear_eval_original_logs.py | 517 +++++++++++++++++++++++++++ 1 file changed, 517 insertions(+) create mode 100755 simsiam/linear_eval_original_logs.py diff --git a/simsiam/linear_eval_original_logs.py b/simsiam/linear_eval_original_logs.py new file mode 100755 index 0000000..34dd262 --- /dev/null +++ b/simsiam/linear_eval_original_logs.py @@ -0,0 +1,517 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree.
+import argparse +import math +import os +import random +import shutil +import time +from functools import partial + +import torch +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +from torch import nn +from torch.backends import cudnn +from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 +from torchvision import datasets, models, transforms +from tqdm import tqdm + +from SimCLR import distributed as dist_utils +from simsiam.lars_optimizer import LARC + + +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + +parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") +parser.add_argument( + "--data_dir", + metavar="DIR", + default="/scratch/ssd004/datasets/imagenet256", + help="path to dataset.", +) +parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet50", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) +parser.add_argument( + "-j", + "--num_workers", + default=4, + type=int, + metavar="N", + help="number of data loading workers (default: 32)", +) +parser.add_argument( + "--epochs", default=90, type=int, metavar="N", help="number of total epochs to run" +) +parser.add_argument( + "-b", + "--batch-size", + default=4096, + type=int, + metavar="N", + help="mini-batch size (default: 4096), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) +parser.add_argument( + "--lr", + "--learning-rate", + default=0.1, + type=float, + metavar="LR", + help="initial (base) learning rate", + dest="lr", +) +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--wd", + "--weight-decay", + default=0.0, + type=float, + metavar="W", + help="weight decay (default: 0.)", + 
dest="weight_decay", +) +parser.add_argument( + "--distributed_mode", + action="store_true", + help="Enable distributed training", +) +parser.add_argument("--distributed_launcher", default="slurm") +parser.add_argument("--distributed_backend", default="nccl") +parser.add_argument( + "--seed", default=42, type=int, help="seed for initializing training. " +) +parser.add_argument( + "--pretrained_checkpoint", + default="", + type=str, + help="Path to simsiam pretrained checkpoint.", +) +parser.add_argument("--lars", action="store_true", help="Use LARS") +parser.add_argument( + "--checkpoint_dir", + default="", + help="Checkpoint directory to save eval model checkpoints.", +) +parser.add_argument( + "-p", + "--print-freq", + default=10, + type=int, + metavar="N", + help="print frequency (default: 10)", +) + +best_acc1 = 0 + + +def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: + """Initialize worker processes with a random seed. + + Parameters + ---------- + worker_id : int + ID of the worker process. + num_workers : int + Total number of workers that will be initialized. + rank : int + The rank of the current process. + seed : int + A random seed used determine the worker seed. 
+ """ + worker_seed = num_workers * rank + worker_id + seed + torch.manual_seed(worker_seed) + random.seed(worker_seed) + + +def main(): + args = parser.parse_args() + global best_acc1 + + torch.multiprocessing.set_start_method("spawn") + if args.distributed_mode: + dist_utils.init_distributed_mode( + launcher=args.distributed_launcher, + backend=args.distributed_backend, + ) + device_id = torch.cuda.current_device() + else: + device_id = None + + # create model + print(f"Creating model {args.arch}") + model = models.__dict__[args.arch]() + + # freeze all layers but the last fc + for name, param in model.named_parameters(): + if name not in ["fc.weight", "fc.bias"]: + param.requires_grad = False + # init the fc layer + model.fc.weight.data.normal_(mean=0.0, std=0.01) + model.fc.bias.data.zero_() + + # load from pre-trained, before DistributedDataParallel constructor + if args.pretrained_checkpoint: + if os.path.isfile(args.pretrained_checkpoint): + print(f"Loading checkpoint {args.pretrained_checkpoint}") + checkpoint = torch.load(args.pretrained_checkpoint, map_location="cpu") + + # rename moco pre-trained keys + state_dict = checkpoint["state_dict"] + for k in list(state_dict.keys()): + # retain only encoder up to before the embedding layer + if k.startswith("module.encoder") and not k.startswith( + "module.encoder.fc" + ): + # remove prefix + state_dict[k[len("module.encoder.") :]] = state_dict[k] + # delete renamed or unused k + del state_dict[k] + + msg = model.load_state_dict(state_dict, strict=False) + assert set(msg.missing_keys) == {"fc.weight", "fc.bias"} + else: + raise ValueError(f"No checkpoint found at: {args.pretrained_checkpoint}") + + # infer learning rate before changing batch size + init_lr = args.lr * args.batch_size / 256 + + if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): + torch.cuda.set_device(device_id) + model = model.cuda(device_id) + model = DDP(model, device_ids=[device_id]) + else: + raise 
NotImplementedError("Only DistributedDataParallel is supported.") + + # define loss function (criterion) and optimizer + criterion = nn.CrossEntropyLoss().cuda(device_id) + + # optimize only the linear classifier + parameters = list(filter(lambda p: p.requires_grad, model.parameters())) + assert len(parameters) == 2 # fc.weight, fc.bias + + # TODO(arashaf): Enable Adam optimizer + optimizer = torch.optim.SGD( + parameters, + init_lr, + momentum=args.momentum, + weight_decay=args.weight_decay, + ) + if args.lars: + print("Use LARS optimizer.") + LARC(optimizer=optimizer, trust_coefficient=0.001, clip=False) + + cudnn.benchmark = True + + # Data loading code + train_dir = os.path.join(args.data_dir, "train") + val_dir = os.path.join(args.data_dir, "val") + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + train_dataset = datasets.ImageFolder( + train_dir, + transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ], + ), + ) + + if dist_utils.is_dist_avail_and_initialized() and args.distributed_mode: + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, + seed=args.seed, + ) + else: + train_sampler = None + + init_fn = partial( + worker_init_fn, + num_workers=args.num_workers, + rank=dist_utils.get_rank(), + seed=args.seed, + ) + train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.batch_size, + shuffle=(train_sampler is None), + sampler=train_sampler, + num_workers=args.num_workers, + worker_init_fn=init_fn, + pin_memory=True, # TODO(arashaf): this was set to false in training script. 
+ ) + + val_dataset = datasets.ImageFolder( + val_dir, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ], + ), + ) + val_loader = torch.utils.data.DataLoader( + val_dataset, + batch_size=256, + shuffle=False, + num_workers=args.num_workers, + pin_memory=True, + ) + + for epoch in tqdm(range(args.epochs)): + print(f"Starting training epoch: {epoch}") + if dist_utils.is_dist_avail_and_initialized(): + train_sampler.set_epoch(epoch) + adjust_learning_rate(optimizer, init_lr, epoch, args) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, device_id, args) + + # evaluate on validation set + acc1 = validate(val_loader, model, criterion, device_id, args) + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if args.checkpoint_dir and dist_utils.get_rank() == 0: + os.makedirs(args.checkpoint_dir, exist_ok=True) + checkpoint_name = "eval_checkpoint_{:04d}.pth.tar".format(epoch) + checkpoint_file = os.path.join(args.checkpoint_dir, checkpoint_name) + save_checkpoint( + { + "epoch": epoch, + "arch": args.arch, + "state_dict": model.state_dict(), + "best_acc1": best_acc1, + "optimizer": optimizer.state_dict(), + }, + is_best, + checkpoint_file, + ) + if epoch == 0: + sanity_check(model.state_dict(), args.pretrained_checkpoint) + + +def train(train_loader, model, criterion, optimizer, epoch, device_id, args): + batch_time = AverageMeter("Time", ":6.3f") + data_time = AverageMeter("Data", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch), + ) + """ + Switch to eval mode: + Under the protocol of linear classification on frozen features/models, + it is not legitimate to change any part of the pre-trained 
model. + BatchNorm in train mode may revise running mean/std (even if it receives + no gradient), which are part of the model parameters too. + """ + model.eval() + + end = time.time() + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + images = images.cuda(device_id, non_blocking=True) + target = target.cuda(device_id, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + +def validate(val_loader, model, criterion, device_id, args): + batch_time = AverageMeter("Time", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") + progress = ProgressMeter( + len(val_loader), [batch_time, losses, top1, top5], prefix="Test: " + ) + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(val_loader): + images = images.cuda(device_id, non_blocking=True) + target = target.cuda(device_id, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + print( + "Validation 
Accuracy@1 {top1.avg:.3f}, Accuracy@5 {top5.avg:.3f}".format( + top1=top1, + top5=top5, + ) + ) + + return top1.avg + + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + print(f"Saving checkpoint at: {filename}") + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, "model_best.pth.tar") + + +def sanity_check(state_dict, pretrained_weights): + """ + Linear classifier should not change any weights other than the linear layer. + This sanity check asserts nothing wrong happens (e.g., BN stats updated). + """ + print(f"Loading {pretrained_weights} for sanity check") + checkpoint = torch.load(pretrained_weights, map_location="cpu") + state_dict_pre = checkpoint["state_dict"] + + for k in list(state_dict.keys()): + # only ignore fc layer + if "fc.weight" in k or "fc.bias" in k: + continue + + # name in pretrained model + k_pre = ( + "module.encoder." + k[len("module.") :] + if k.startswith("module.") + else "module.encoder." + k + ) + + assert ( + state_dict[k].cpu() == state_dict_pre[k_pre] + ).all(), "{} is changed in linear classifier training.".format(k) + + print("Sanity check passed.") + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=":f"): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + 
print("\t".join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = "{:" + str(num_digits) + "d}" + return "[" + fmt + "/" + fmt.format(num_batches) + "]" + + +def adjust_learning_rate(optimizer, init_lr, epoch, args): + """Decay the learning rate based on schedule""" + cur_lr = init_lr * 0.5 * (1.0 + math.cos(math.pi * epoch / args.epochs)) + for param_group in optimizer.param_groups: + param_group["lr"] = cur_lr + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == "__main__": + main() From a16c5a8cf165442d342ababa90d4c469f7a61ba7 Mon Sep 17 00:00:00 2001 From: Arash Afkanpour Date: Mon, 4 Mar 2024 14:23:08 -0500 Subject: [PATCH 09/38] changes to multiple scripts --- ...line_50.slrm => eval_simsiam_baseline.slrm | 13 ++- ...blediff_50.slrm => eval_simsiam_icgan.slrm | 14 ++- ...an_50.slrm => eval_simsiam_stablediff.slrm | 13 ++- simsiam/LARC.py | 105 ++++++++++++++++++ simsiam/linear_eval.py | 5 +- simsiam/main_simsiam.py | 73 +++++++++++- train_simsiam.slrm | 28 ++--- 7 files changed, 214 insertions(+), 37 deletions(-) rename eval_simsiam_baseline_50.slrm => eval_simsiam_baseline.slrm (74%) rename eval_simsiam_stablediff_50.slrm => eval_simsiam_icgan.slrm (75%) rename eval_simsiam_icgan_50.slrm => eval_simsiam_stablediff.slrm (74%) create mode 100644 simsiam/LARC.py diff --git a/eval_simsiam_baseline_50.slrm b/eval_simsiam_baseline.slrm similarity index 74% rename from eval_simsiam_baseline_50.slrm rename to eval_simsiam_baseline.slrm index 11f5c5e..44a6299 100644 
--- a/eval_simsiam_baseline_50.slrm +++ b/eval_simsiam_baseline.slrm @@ -4,8 +4,9 @@ #SBATCH --partition=a40 #SBATCH --account=deadline #SBATCH --qos=deadline -#SBATCH --nodes=2 -#SBATCH --gres=gpu:8 +#SBATCH --time=72:00:00 +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu=8G @@ -30,6 +31,8 @@ srun python simsiam/linear_eval.py \ --data_dir="/scratch/ssd004/datasets/imagenet256" \ --arch="resnet50" \ --distributed_mode \ ---batch-size=256 \ ---epochs=100 \ ---pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_2024-02-29-09-27/checkpoint_0050.pth.tar" +--batch-size=1024 \ +--lars \ +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_2024-02-29-14-49/checkpoint_0099.pth.tar" \ + +# --checkpoint_dir="/projects/imagenet_synthetic/model_checkpoints/eval_original_simsiam_baseline" diff --git a/eval_simsiam_stablediff_50.slrm b/eval_simsiam_icgan.slrm similarity index 75% rename from eval_simsiam_stablediff_50.slrm rename to eval_simsiam_icgan.slrm index 6b46de6..6c1070a 100644 --- a/eval_simsiam_stablediff_50.slrm +++ b/eval_simsiam_icgan.slrm @@ -4,14 +4,14 @@ #SBATCH --partition=a40 #SBATCH --account=deadline #SBATCH --qos=deadline -#SBATCH --nodes=2 -#SBATCH --gres=gpu:8 +#SBATCH --time=72:00:00 +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu=8G #SBATCH --output=slurm-%j.out - PY_ARGS=${@:1} # load virtual environment @@ -31,6 +31,8 @@ srun python simsiam/linear_eval.py \ --data_dir="/scratch/ssd004/datasets/imagenet256" \ --arch="resnet50" \ --distributed_mode \ ---batch-size=256 \ ---epochs=100 \ ---pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_2024-02-29-09-33/checkpoint_0050.pth.tar" +--batch-size=1024 \ +--lars \ 
+--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_2024-02-29-18-40/checkpoint_0099.pth.tar" \ + +# --checkpoint_dir="/projects/imagenet_synthetic/model_checkpoints/eval_simsiam_icgan_ep0099" \ No newline at end of file diff --git a/eval_simsiam_icgan_50.slrm b/eval_simsiam_stablediff.slrm similarity index 74% rename from eval_simsiam_icgan_50.slrm rename to eval_simsiam_stablediff.slrm index de9ce1a..9274bbe 100644 --- a/eval_simsiam_icgan_50.slrm +++ b/eval_simsiam_stablediff.slrm @@ -2,10 +2,11 @@ #SBATCH --job-name="simsiam_eval" #SBATCH --partition=a40 +#SBATCH --time=72:00:00 #SBATCH --account=deadline #SBATCH --qos=deadline -#SBATCH --nodes=2 -#SBATCH --gres=gpu:8 +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu=8G @@ -30,6 +31,8 @@ srun python simsiam/linear_eval.py \ --data_dir="/scratch/ssd004/datasets/imagenet256" \ --arch="resnet50" \ --distributed_mode \ ---batch-size=256 \ ---epochs=100 \ ---pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_2024-02-29-09-30/checkpoint_0050.pth.tar" +--batch-size=1024 \ +--lars \ +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_2024-02-29-15-27/checkpoint_0099.pth.tar" \ + +# --checkpoint_dir="/projects/imagenet_synthetic/model_checkpoints/eval_simsiam_stablediff_ep0099" \ No newline at end of file diff --git a/simsiam/LARC.py b/simsiam/LARC.py new file mode 100644 index 0000000..692ba18 --- /dev/null +++ b/simsiam/LARC.py @@ -0,0 +1,105 @@ +import torch + + +class LARC(object): + """ + :class:`LARC` is a pytorch implementation of both the scaling and clipping variants of LARC, + in which the ratio between gradient and parameter magnitudes is used to calculate an adaptive + local learning rate for each individual parameter. The algorithm is designed to improve + convergence of large batch training. 
+ + See https://arxiv.org/abs/1708.03888 for calculation of the local learning rate. + In practice it modifies the gradients of parameters as a proxy for modifying the learning rate + of the parameters. This design allows it to be used as a wrapper around any torch.optim Optimizer. + ``` + model = ... + optim = torch.optim.Adam(model.parameters(), lr=...) + optim = LARC(optim) + ``` + It can even be used in conjunction with apex.fp16_utils.FP16_optimizer. + ``` + model = ... + optim = torch.optim.Adam(model.parameters(), lr=...) + optim = LARC(optim) + optim = apex.fp16_utils.FP16_Optimizer(optim) + ``` + Args: + optimizer: Pytorch optimizer to wrap and modify learning rate for. + trust_coefficient: Trust coefficient for calculating the lr. See https://arxiv.org/abs/1708.03888 + clip: Decides between clipping or scaling mode of LARC. If `clip=True` the learning rate is set to `min(optimizer_lr, local_lr)` for each parameter. If `clip=False` the learning rate is set to `local_lr*optimizer_lr`. 
+ eps: epsilon kludge to help with numerical stability while calculating adaptive_lr + """ + + def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1e-8): + self.optim = optimizer + self.trust_coefficient = trust_coefficient + self.eps = eps + self.clip = clip + + def __getstate__(self): + return self.optim.__getstate__() + + def __setstate__(self, state): + self.optim.__setstate__(state) + + @property + def state(self): + return self.optim.state + + def __repr__(self): + return self.optim.__repr__() + + @property + def param_groups(self): + return self.optim.param_groups + + @param_groups.setter + def param_groups(self, value): + self.optim.param_groups = value + + def state_dict(self): + return self.optim.state_dict() + + def load_state_dict(self, state_dict): + self.optim.load_state_dict(state_dict) + + def zero_grad(self): + self.optim.zero_grad() + + def add_param_group(self, param_group): + self.optim.add_param_group(param_group) + + def step(self): + with torch.no_grad(): + weight_decays = [] + for group in self.optim.param_groups: + # absorb weight decay control from optimizer + weight_decay = group["weight_decay"] if "weight_decay" in group else 0 + weight_decays.append(weight_decay) + group["weight_decay"] = 0 + for p in group["params"]: + if p.grad is None: + continue + param_norm = torch.norm(p.data) + grad_norm = torch.norm(p.grad.data) + + if param_norm != 0 and grad_norm != 0: + # calculate adaptive lr + weight decay + adaptive_lr = ( + self.trust_coefficient + * (param_norm) + / (grad_norm + param_norm * weight_decay + self.eps) + ) + + # clip learning rate for LARC + if self.clip: + # calculation of adaptive_lr so that when multiplied by lr it equals `min(adaptive_lr, lr)` + adaptive_lr = min(adaptive_lr / group["lr"], 1) + + p.grad.data += weight_decay * p.data + p.grad.data *= adaptive_lr + + self.optim.step() + # return weight decay control to optimizer + for i, group in enumerate(self.optim.param_groups): + group["weight_decay"] 
= weight_decays[i] diff --git a/simsiam/linear_eval.py b/simsiam/linear_eval.py index a088d2e..9bf31a4 100755 --- a/simsiam/linear_eval.py +++ b/simsiam/linear_eval.py @@ -115,7 +115,6 @@ def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: """Initialize worker processes with a random seed. - Parameters ---------- worker_id : int @@ -182,7 +181,7 @@ def main(): raise ValueError(f"No checkpoint found at: {args.pretrained_checkpoint}") # infer learning rate before changing batch size - init_lr = args.lr * args.batch_size / 256 + init_lr = args.lr * args.batch_size * 4.0 / 256.0 if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): torch.cuda.set_device(device_id) @@ -203,7 +202,7 @@ def main(): ) if args.lars: print("Use LARS optimizer.") - from apex.parallel.LARC import LARC + from LARC import LARC optimizer = LARC(optimizer=optimizer, trust_coefficient=0.001, clip=False) diff --git a/simsiam/main_simsiam.py b/simsiam/main_simsiam.py index e486c4b..56da4ea 100755 --- a/simsiam/main_simsiam.py +++ b/simsiam/main_simsiam.py @@ -138,7 +138,7 @@ ) parser.add_argument( "--synthetic_data_dir", - default="/projects/imagenet_synthetic/synthetic_icgan", + default="/projects/imagenet_synthetic/", help="Path to the root of synthetic data.", ) parser.add_argument( @@ -159,6 +159,14 @@ type=float, help="The probability of applying a generative model augmentation to a view. Applies to the views separately.", ) +parser.add_argument( + "-p", + "--print-freq", + default=10, + type=int, + metavar="N", + help="print frequency (default: 10)", +) def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: @@ -270,7 +278,10 @@ def main(): optim_params = model.parameters() # infer learning rate before changing batch size - init_lr = args.lr * args.batch_size / 256.0 + # init_lr = args.lr * args.batch_size / 256.0 + # TODO(arashaf): Hard-code init-lr to match the original paper with bs=512. 
+ init_lr = args.lr * 2.0 + optimizer = torch.optim.SGD( optim_params, init_lr, @@ -319,11 +330,18 @@ def main(): def train(train_loader, model, criterion, optimizer, epoch, device_id, args): """Single epoch training code.""" + losses = AverageMeter("Loss", ":.4f") + progress = ProgressMeter( + len(train_loader), + [losses], + prefix="Epoch: [{}]".format(epoch), + ) + # switch to train mode model.train() - # for i, (images, _) in enumerate(train_loader): - for images, _ in tqdm(train_loader): + for i, (images, _) in enumerate(train_loader): + # for images, _ in tqdm(train_loader): images[0] = images[0].cuda(device_id, non_blocking=True) images[1] = images[1].cuda(device_id, non_blocking=True) @@ -331,11 +349,16 @@ def train(train_loader, model, criterion, optimizer, epoch, device_id, args): p1, p2, z1, z2 = model(x1=images[0], x2=images[1]) loss = -(criterion(p1, z2).mean() + criterion(p2, z1).mean()) * 0.5 + losses.update(loss.item(), images[0].size(0)) + # compute gradient and do SGD step optimizer.zero_grad() loss.backward() optimizer.step() + if i % args.print_freq == 0: + progress.display(i) + def save_checkpoint(state, filename="checkpoint.pth.tar"): """Save state dictionary into a model checkpoint.""" @@ -343,6 +366,48 @@ def save_checkpoint(state, filename="checkpoint.pth.tar"): torch.save(state, filename) +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=":f"): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + 
self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print("\t".join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = "{:" + str(num_digits) + "d}" + return "[" + fmt + "/" + fmt.format(num_batches) + "]" + + def adjust_learning_rate(optimizer, init_lr, epoch, args): """Decay the learning rate based on schedule.""" cur_lr = init_lr * 0.5 * (1.0 + math.cos(math.pi * epoch / args.epochs)) diff --git a/train_simsiam.slrm b/train_simsiam.slrm index 4c8f46d..dc8d23f 100644 --- a/train_simsiam.slrm +++ b/train_simsiam.slrm @@ -1,14 +1,15 @@ #!/bin/bash -#SBATCH --job-name="simsiam_baseline" -#SBATCH --partition=a100 -#SBATCH --qos=a100_arashaf +#SBATCH --job-name="simsiam_train" +#SBATCH --partition=a40 +#SBATCH --account=deadline +#SBATCH --qos=deadline #SBATCH --time=72:00:00 #SBATCH --nodes=1 #SBATCH --gres=gpu:4 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=4 -#SBATCH --mem-per-cpu=4G +#SBATCH --mem-per-cpu=8G #SBATCH --output=slurm-%j.out PY_ARGS=${@:1} @@ -30,16 +31,15 @@ srun python simsiam/main_simsiam.py \ -a resnet50 \ --fix-pred-lr \ --distributed_mode \ ---batch-size=256 \ +--batch-size=128 \ --epochs=100 \ ---experiment="simsiam_baseline" \ ---resume_from_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_2024-02-27-16-48/checkpoint_22.pth.tar" +--experiment="simsiam_stablediff_p0p5_seed43" \ +--resume_from_checkpoint="" \ +--seed=43 \ +--use_synthetic_data \ +--synthetic_data_dir="/projects/imagenet_synthetic/arashaf_stablediff_batched" \ +--synthetic_index_min=0 \ +--synthetic_index_max=1 \ +--generative_augmentation_prob=0.5 - - -# --use_synthetic_data \ -# --synthetic_data_dir="/projects/imagenet_synthetic/synthetic_icgan" \ -# --synthetic_index_min=0 \ -# --synthetic_index_max=4 \ -# --generative_augmentation_prob=0.5 \ From 
18d879826fdcb88535a36bc215df9eb7aad59700 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 4 Mar 2024 12:31:06 -0800 Subject: [PATCH 10/38] add adil multi node script --- simsiam/adil_linear_eval.py | 464 ++++++++++++++++++++++++++++++++++ simsiam/adil_main_simsiam.py | 436 ++++++++++++++++++++++++++++++++ train_simsiam_multinode.slrm | 57 +++++ train_simsiam_singlenode.slrm | 40 +++ 4 files changed, 997 insertions(+) create mode 100644 simsiam/adil_linear_eval.py create mode 100644 simsiam/adil_main_simsiam.py create mode 100644 train_simsiam_multinode.slrm create mode 100644 train_simsiam_singlenode.slrm diff --git a/simsiam/adil_linear_eval.py b/simsiam/adil_linear_eval.py new file mode 100644 index 0000000..3911691 --- /dev/null +++ b/simsiam/adil_linear_eval.py @@ -0,0 +1,464 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import math +import os +import random +import shutil +from functools import partial + +import torch +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +from torch import nn +from torch.backends import cudnn +from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 +from torchvision import datasets, models, transforms +from tqdm import tqdm + +from SimCLR import distributed as dist_utils +from torch import distributed as dist + + +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + +parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") +parser.add_argument( + "--data_dir", + metavar="DIR", + default="/scratch/ssd004/datasets/imagenet256", + help="path to dataset.", +) +parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet50", + choices=model_names, + 
help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) +parser.add_argument( + "-j", + "--num_workers", + default=4, + type=int, + metavar="N", + help="number of data loading workers (default: 32)", +) +parser.add_argument( + "--epochs", default=90, type=int, metavar="N", help="number of total epochs to run" +) +parser.add_argument( + "-b", + "--batch-size", + default=4096, + type=int, + metavar="N", + help="mini-batch size (default: 4096), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) +parser.add_argument( + "--lr", + "--learning-rate", + default=0.1, + type=float, + metavar="LR", + help="initial (base) learning rate", + dest="lr", +) +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--wd", + "--weight-decay", + default=0.0, + type=float, + metavar="W", + help="weight decay (default: 0.)", + dest="weight_decay", +) +parser.add_argument( + "--distributed_mode", + action="store_true", + help="Enable distributed training", +) +parser.add_argument("--distributed_launcher", default="slurm") +parser.add_argument("--distributed_backend", default="nccl") +parser.add_argument( + "--seed", default=42, type=int, help="seed for initializing training. " +) +parser.add_argument( + "--pretrained_checkpoint", + default="", + type=str, + help="Path to simsiam pretrained checkpoint.", +) +parser.add_argument("--lars", action="store_true", help="Use LARS") +parser.add_argument( + "--checkpoint_dir", + default="", + help="Checkpoint directory to save eval model checkpoints.", +) + + +best_acc1 = 0 + + +def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: + """Initialize worker processes with a random seed. + + Parameters + ---------- + worker_id : int + ID of the worker process. + num_workers : int + Total number of workers that will be initialized. 
+ rank : int + The rank of the current process. + seed : int + A random seed used determine the worker seed. + """ + worker_seed = num_workers * rank + worker_id + seed + torch.manual_seed(worker_seed) + random.seed(worker_seed) + +def setup() -> None: + """Initialize the process group.""" + dist.init_process_group("nccl") + + +def cleanup() -> None: + """Clean up the process group after training.""" + dist.destroy_process_group() + + +def main(): + args = parser.parse_args() + global best_acc1 + + # torch.multiprocessing.set_start_method("spawn") + if args.distributed_mode: + # dist_utils.init_distributed_mode( + # launcher=args.distributed_launcher, + # backend=args.distributed_backend, + # ) + setup() + torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) + torch.cuda.empty_cache() + device_id = torch.cuda.current_device() + else: + device_id = None + + # create model + print(f"Creating model {args.arch}") + model = models.__dict__[args.arch]() + + # freeze all layers but the last fc + for name, param in model.named_parameters(): + if name not in ["fc.weight", "fc.bias"]: + param.requires_grad = False + # init the fc layer + model.fc.weight.data.normal_(mean=0.0, std=0.01) + model.fc.bias.data.zero_() + + # load from pre-trained, before DistributedDataParallel constructor + if args.pretrained_checkpoint: + if os.path.isfile(args.pretrained_checkpoint): + print(f"Loading checkpoint {args.pretrained_checkpoint}") + checkpoint = torch.load(args.pretrained_checkpoint, map_location="cpu") + + # rename moco pre-trained keys + state_dict = checkpoint["state_dict"] + for k in list(state_dict.keys()): + # retain only encoder up to before the embedding layer + if k.startswith("module.encoder") and not k.startswith( + "module.encoder.fc" + ): + # remove prefix + state_dict[k[len("module.encoder.") :]] = state_dict[k] + # delete renamed or unused k + del state_dict[k] + + msg = model.load_state_dict(state_dict, strict=False) + assert set(msg.missing_keys) == {"fc.weight", 
"fc.bias"} + else: + raise ValueError(f"No checkpoint found at: {args.pretrained_checkpoint}") + + # infer learning rate before changing batch size + init_lr = args.lr * args.batch_size / 256 + + if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): + # torch.cuda.set_device(device_id) + model = model.cuda(device_id) + model = DDP(model, device_ids=[device_id]) + else: + raise NotImplementedError("Only DistributedDataParallel is supported.") + + # define loss function (criterion) and optimizer + criterion = nn.CrossEntropyLoss().cuda(device_id) + + # optimize only the linear classifier + parameters = list(filter(lambda p: p.requires_grad, model.parameters())) + assert len(parameters) == 2 # fc.weight, fc.bias + + optimizer = torch.optim.SGD( + parameters, init_lr, momentum=args.momentum, weight_decay=args.weight_decay + ) + if args.lars: + print("Use LARS optimizer.") + # from apex.parallel.LARC import LARC + from LARC import LARC + + optimizer = LARC(optimizer=optimizer, trust_coefficient=0.001, clip=False) + + cudnn.benchmark = True + + # Data loading code + train_dir = os.path.join(args.data_dir, "train") + val_dir = os.path.join(args.data_dir, "val") + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + train_dataset = datasets.ImageFolder( + train_dir, + transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ], + ), + ) + + if dist_utils.is_dist_avail_and_initialized() and args.distributed_mode: + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, + seed=args.seed, + ) + else: + train_sampler = None + + init_fn = partial( + worker_init_fn, + num_workers=args.num_workers, + rank=dist_utils.get_rank(), + seed=args.seed, + ) + train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.batch_size, + shuffle=(train_sampler is None), + sampler=train_sampler, + 
num_workers=args.num_workers, + worker_init_fn=init_fn, + pin_memory=True, # TODO(arashaf): this was set to false in training script. + ) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder( + val_dir, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ], + ), + ), + batch_size=256, + shuffle=False, + num_workers=args.num_workers, + pin_memory=True, + ) + + for epoch in tqdm(range(args.epochs)): + print(f"Starting training epoch: {epoch}") + if dist_utils.is_dist_avail_and_initialized(): + train_sampler.set_epoch(epoch) + adjust_learning_rate(optimizer, init_lr, epoch, args) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, device_id, args) + + # evaluate on validation set + acc1 = validate(val_loader, model, criterion, device_id, args) + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if args.checkpoint_dir and dist_utils.get_rank() == 0: + os.makedirs(args.checkpoint_dir, exist_ok=True) + checkpoint_name = "eval_checkpoint_{:04d}.pth.tar".format(epoch) + checkpoint_file = os.path.join(args.checkpoint_dir, checkpoint_name) + save_checkpoint( + { + "epoch": epoch, + "arch": args.arch, + "state_dict": model.state_dict(), + "best_acc1": best_acc1, + "optimizer": optimizer.state_dict(), + }, + is_best, + checkpoint_file, + ) + if epoch == 0: + sanity_check(model.state_dict(), args.pretrained_checkpoint) + + +def train(train_loader, model, criterion, optimizer, epoch, device_id, args): + """ + Switch to eval mode: + Under the protocol of linear classification on frozen features/models, + it is not legitimate to change any part of the pre-trained model. + BatchNorm in train mode may revise running mean/std (even if it receives + no gradient), which are part of the model parameters too. 
+ """ + model.eval() + + for images, target in tqdm(train_loader): + images = images.cuda(device_id, non_blocking=True) + target = target.cuda(device_id, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +def validate(val_loader, model, criterion, device_id, args): + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + for images, target in tqdm(val_loader): + images = images.cuda(device_id, non_blocking=True) + target = target.cuda(device_id, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + print( + "Validation Accuracy@1 {top1.avg:.3f}, Accuracy@5 {top5.avg:.3f}".format( + top1=top1, top5=top5 + ) + ) + + return top1.avg + + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + print(f"Saving checkpoint at: {filename}") + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, "model_best.pth.tar") + + +def sanity_check(state_dict, pretrained_weights): + """ + Linear classifier should not change any weights other than the linear layer. + This sanity check asserts nothing wrong happens (e.g., BN stats updated). + """ + print(f"Loading {pretrained_weights} for sanity check") + checkpoint = torch.load(pretrained_weights, map_location="cpu") + state_dict_pre = checkpoint["state_dict"] + + for k in list(state_dict.keys()): + # only ignore fc layer + if "fc.weight" in k or "fc.bias" in k: + continue + + # name in pretrained model + k_pre = ( + "module.encoder." 
+ k[len("module.") :] + if k.startswith("module.") + else "module.encoder." + k + ) + + assert ( + state_dict[k].cpu() == state_dict_pre[k_pre] + ).all(), "{} is changed in linear classifier training.".format(k) + + print("Sanity check passed.") + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=":f"): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" + return fmtstr.format(**self.__dict__) + + +def adjust_learning_rate(optimizer, init_lr, epoch, args): + """Decay the learning rate based on schedule""" + cur_lr = init_lr * 0.5 * (1.0 + math.cos(math.pi * epoch / args.epochs)) + for param_group in optimizer.param_groups: + param_group["lr"] = cur_lr + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == "__main__": + main() diff --git a/simsiam/adil_main_simsiam.py b/simsiam/adil_main_simsiam.py new file mode 100644 index 0000000..ec6a5b4 --- /dev/null +++ b/simsiam/adil_main_simsiam.py @@ -0,0 +1,436 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import argparse +import math +import os +import random +from datetime import datetime +from functools import partial + +import torch +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +from torch import nn +from torch.backends import cudnn +from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 +from torch.utils.data.distributed import DistributedSampler +from torchvision import datasets, models +from tqdm import tqdm + +from SimCLR import distributed as dist_utils +from simsiam import builder, loader +from torch import distributed as dist + + +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + +parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") +parser.add_argument( + "--data_dir", + metavar="DIR", + default="/scratch/ssd004/datasets/imagenet256", + help="path to dataset.", +) +parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet50", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) +parser.add_argument( + "-j", + "--num_workers", + default=4, + type=int, + metavar="N", + help="number of data loading workers (default: 32)", +) +parser.add_argument( + "--epochs", default=100, type=int, metavar="N", help="number of total epochs to run" +) +parser.add_argument( + "-b", + "--batch-size", + default=256, + type=int, + metavar="N", + help="mini-batch size (default: 512), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) +parser.add_argument( + "--lr", + "--learning-rate", + default=0.05, + type=float, + metavar="LR", + help="initial (base) learning rate", + dest="lr", +) +parser.add_argument( + "--momentum", default=0.9, type=float, metavar="M", help="momentum of SGD solver" +) +parser.add_argument( + "--wd", + 
"--weight-decay", + default=1e-4, + type=float, + metavar="W", + help="weight decay (default: 1e-4)", + dest="weight_decay", +) +parser.add_argument( + "--resume_from_checkpoint", + default="", + type=str, + help="Path to latest checkpoint.", +) +parser.add_argument( + "--seed", default=42, type=int, help="seed for initializing training. " +) + +# simsiam specific configs: +parser.add_argument( + "--dim", default=2048, type=int, help="feature dimension (default: 2048)" +) +parser.add_argument( + "--pred-dim", + default=512, + type=int, + help="hidden dimension of the predictor (default: 512)", +) +parser.add_argument( + "--fix-pred-lr", action="store_true", help="Fix learning rate for the predictor" +) + +parser.add_argument( + "--distributed_mode", + action="store_true", + help="Enable distributed training", +) +parser.add_argument("--distributed_launcher", default="slurm") +parser.add_argument("--distributed_backend", default="nccl") +parser.add_argument( + "--checkpoint_dir", + default="/projects/imagenet_synthetic/model_checkpoints", + help="Checkpoint root directory.", +) +parser.add_argument( + "--experiment", + default="", + help="Experiment name.", +) +parser.add_argument( + "--use_synthetic_data", + action=argparse.BooleanOptionalAction, + help="Whether to use real data or synthetic data for training.", +) +parser.add_argument( + "--synthetic_data_dir", + default="/projects/imagenet_synthetic/", + help="Path to the root of synthetic data.", +) +parser.add_argument( + "--synthetic_index_min", + default=0, + type=int, + help="Synthetic data files are named filename_i.JPEG. This index determines the lower bound for i.", +) +parser.add_argument( + "--synthetic_index_max", + default=9, + type=int, + help="Synthetic data files are named filename_i.JPEG. 
This index determines the upper bound for i.", +) +parser.add_argument( + "--generative_augmentation_prob", + default=None, + type=float, + help="The probability of applying a generative model augmentation to a view. Applies to the views separately.", +) +parser.add_argument( + "-p", + "--print-freq", + default=10, + type=int, + metavar="N", + help="print frequency (default: 10)", +) + + +def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: + """Initialize worker processes with a random seed. + + Parameters + ---------- + worker_id : int + ID of the worker process. + num_workers : int + Total number of workers that will be initialized. + rank : int + The rank of the current process. + seed : int + A random seed used determine the worker seed. + """ + worker_seed = num_workers * rank + worker_id + seed + torch.manual_seed(worker_seed) + random.seed(worker_seed) + +def setup() -> None: + """Initialize the process group.""" + dist.init_process_group("nccl") + + +def cleanup() -> None: + """Clean up the process group after training.""" + dist.destroy_process_group() + + +def main(): + args = parser.parse_args() + current_time = datetime.now().strftime("%Y-%m-%d-%H-%M") + checkpoint_subdir = ( + f"{args.experiment}_{current_time}" if args.experiment else f"{current_time}" + ) + args.checkpoint_dir = os.path.join(args.checkpoint_dir, checkpoint_subdir) + os.makedirs(args.checkpoint_dir, exist_ok=True) + + print(args) + + # torch.multiprocessing.set_start_method("spawn") + # torch.multiprocessing.set_start_method("spawn") + if args.distributed_mode: + # dist_utils.init_distributed_mode( + # launcher=args.distributed_launcher, + # backend=args.distributed_backend, + # ) + setup() + torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) + torch.cuda.empty_cache() + device_id = torch.cuda.current_device() + else: + device_id = None + + # Data loading. 
+ if args.use_synthetic_data: + print( + f"Using synthetic data for training at {args.synthetic_data_dir} between indices {args.synthetic_index_min} and {args.synthetic_index_max}." + ) + train_dataset = loader.ImageNetSynthetic( + args.data_dir, + args.synthetic_data_dir, + index_min=args.synthetic_index_min, + index_max=args.synthetic_index_max, + generative_augmentation_prob=args.generative_augmentation_prob, + ) + else: + print(f"Using real data for training at {args.data_dir}.") + train_data_dir = os.path.join(args.data_dir, "train") + train_dataset = datasets.ImageFolder(train_data_dir, loader.TwoCropsTransform()) + + train_sampler = None + if dist_utils.is_dist_avail_and_initialized() and args.distributed_mode: + train_sampler = DistributedSampler( + train_dataset, + seed=args.seed, + drop_last=True, + ) + init_fn = partial( + worker_init_fn, + num_workers=args.num_workers, + rank=dist_utils.get_rank(), + seed=args.seed, + ) + + train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.batch_size, + shuffle=(train_sampler is None), + sampler=train_sampler, + num_workers=args.num_workers, + worker_init_fn=init_fn, + pin_memory=False, + drop_last=True, + ) + + print(f"Creating model {args.arch}") + model = builder.SimSiam(models.__dict__[args.arch], args.dim, args.pred_dim) + + if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): + # Apply SyncBN + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + # set the single device scope, otherwise DistributedDataParallel will + # use all available devices + torch.cuda.set_device(device_id) + model = model.cuda(device_id) + model = DDP(model, device_ids=[device_id]) + else: + raise NotImplementedError("Only DistributedDataParallel is supported.") + print(model) # print model after SyncBatchNorm + + # define loss function (criterion) and optimizer + criterion = nn.CosineSimilarity(dim=1).cuda(device_id) + + if args.fix_pred_lr: + optim_params = [ + {"params": 
model.module.encoder.parameters(), "fix_lr": False}, + {"params": model.module.predictor.parameters(), "fix_lr": True}, + ] + else: + optim_params = model.parameters() + + # infer learning rate before changing batch size + # init_lr = args.lr * args.batch_size / 256.0 + # TODO(arashaf): Hard-code init-lr to match the original paper with bs=512. + init_lr = args.lr * 2.0 + + optimizer = torch.optim.SGD( + optim_params, + init_lr, + momentum=args.momentum, + weight_decay=args.weight_decay, + ) + + start_epoch = 0 + # Optionally resume from a checkpoint + if args.resume_from_checkpoint: + if os.path.isfile(args.resume_from_checkpoint): + print(f"Loading checkpoint: {args.resume_from_checkpoint}") + checkpoint = torch.load(args.resume_from_checkpoint) + start_epoch = checkpoint["epoch"] + 1 + model.load_state_dict(checkpoint["state_dict"]) + optimizer.load_state_dict(checkpoint["optimizer"]) + print(f"Loaded checkpoint {args.resume_from_checkpoint} successfully.") + else: + raise ValueError(f"No checkpoint found at: {args.resume_from_checkpoint}") + + cudnn.benchmark = True + + for epoch in range(start_epoch, args.epochs): + print(f"Starting training epoch: {epoch}") + if dist_utils.is_dist_avail_and_initialized(): + train_sampler.set_epoch(epoch) + adjust_learning_rate(optimizer, init_lr, epoch, args) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, device_id, args) + + # Checkpointing. 
class AverageMeter:
    """Keep the latest value of a metric together with its weighted running mean."""

    def __init__(self, name, fmt=":f"):
        # ``fmt`` is a format-spec suffix (e.g. ":.4f") spliced into the
        # template built in ``__str__``.
        self.name, self.fmt = name, fmt
        self.reset()

    def reset(self):
        """Zero the current value, the running mean, the sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighting it by ``n`` in the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

    def __str__(self):
        template = f"{{name}} {{val{self.fmt}}} ({{avg{self.fmt}}})"
        return template.format(**vars(self))
def adjust_learning_rate(optimizer, init_lr, epoch, args):
    """Apply a half-cosine learning-rate decay to the optimizer in place.

    Every parameter group receives ``init_lr * 0.5 * (1 + cos(pi * epoch /
    args.epochs))``, except groups carrying a truthy ``"fix_lr"`` entry, which
    stay at ``init_lr``.
    """
    decayed_lr = init_lr * 0.5 * (1.0 + math.cos(math.pi * epoch / args.epochs))
    for group in optimizer.param_groups:
        group["lr"] = init_lr if group.get("fix_lr") else decayed_lr
+nvidia-smi + +srun -p $SLURM_JOB_PARTITION \ + -c $SLURM_CPUS_ON_NODE \ + -N $SLURM_JOB_NUM_NODES \ + --mem=0 \ + --gres=gpu:$SLURM_JOB_PARTITION:$SLURM_GPUS_ON_NODE \ + bash -c 'torchrun \ + --nproc-per-node=$SLURM_GPUS_ON_NODE \ + --nnodes=$SLURM_JOB_NUM_NODES \ + --rdzv-endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv-id $RDVZ_ID \ + --rdzv-backend c10d \ + simsiam/adil_main_simsiam.py \ + -a resnet50 \ + --fix-pred-lr \ + --distributed_mode \ + --batch-size=128 \ + --epochs=100 \ + --experiment="simsiam_stablediff_p0p5_seed43" \ + --resume_from_checkpoint="" \ + --seed=43 \ + --use_synthetic_data \ + --synthetic_data_dir="/projects/imagenet_synthetic/arashaf_stablediff_batched" \ + --synthetic_index_min=0 \ + --synthetic_index_max=1 \ + --generative_augmentation_prob=0.5' \ No newline at end of file diff --git a/train_simsiam_singlenode.slrm b/train_simsiam_singlenode.slrm new file mode 100644 index 0000000..b930947 --- /dev/null +++ b/train_simsiam_singlenode.slrm @@ -0,0 +1,40 @@ +#!/bin/bash + +#SBATCH --job-name="simsiam_single_train" +#SBATCH --partition=a40 +#SBATCH --qos=m +#SBATCH --nodes=1 +#SBATCH --gres=gpu:a40:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=singlenode-%j.out +#SBATCH --error=singlenode-%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=12:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 + +export PYTHONPATH="." 
+nvidia-smi + +torchrun --nproc-per-node=4 --nnodes=1 simsiam/adil_main_simsiam.py \ + -a resnet50 \ + --fix-pred-lr \ + --distributed_mode \ + --batch-size=128 \ + --epochs=100 \ + --experiment="simsiam_stablediff_p0p5_seed43" \ + --resume_from_checkpoint="" \ + --seed=43 \ + --use_synthetic_data \ + --synthetic_data_dir="/projects/imagenet_synthetic/arashaf_stablediff_batched" \ + --synthetic_index_min=0 \ + --synthetic_index_max=1 \ + --generative_augmentation_prob=0.5 \ No newline at end of file From 984cc36e53d4017520881c383cda42307fb82512 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 4 Mar 2024 14:04:05 -0800 Subject: [PATCH 11/38] add add multi gpu to original loging --- simsiam/adil_linear_eval_original_logs.py | 525 ++++++++++++++++++++++ simsiam/adil_main_simsiam.py | 2 +- simsiam/linear_eval_original_logs.py | 2 +- 3 files changed, 527 insertions(+), 2 deletions(-) create mode 100644 simsiam/adil_linear_eval_original_logs.py diff --git a/simsiam/adil_linear_eval_original_logs.py b/simsiam/adil_linear_eval_original_logs.py new file mode 100644 index 0000000..537040b --- /dev/null +++ b/simsiam/adil_linear_eval_original_logs.py @@ -0,0 +1,525 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+import argparse +import math +import os +import random +import shutil +import time +from functools import partial + +import torch +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +from torch import nn +from torch.backends import cudnn +from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 +from torchvision import datasets, models, transforms +from tqdm import tqdm + +from SimCLR import distributed as dist_utils +from LARC import LARC +from torch import distributed as dist + +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + +parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") +parser.add_argument( + "--data_dir", + metavar="DIR", + default="/scratch/ssd004/datasets/imagenet256", + help="path to dataset.", +) +parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet50", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) +parser.add_argument( + "-j", + "--num_workers", + default=4, + type=int, + metavar="N", + help="number of data loading workers (default: 32)", +) +parser.add_argument( + "--epochs", default=90, type=int, metavar="N", help="number of total epochs to run" +) +parser.add_argument( + "-b", + "--batch-size", + default=4096, + type=int, + metavar="N", + help="mini-batch size (default: 4096), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) +parser.add_argument( + "--lr", + "--learning-rate", + default=0.1, + type=float, + metavar="LR", + help="initial (base) learning rate", + dest="lr", +) +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--wd", + "--weight-decay", + default=0.0, + type=float, + metavar="W", + help="weight decay (default: 
0.)", + dest="weight_decay", +) +parser.add_argument( + "--distributed_mode", + action="store_true", + help="Enable distributed training", +) +parser.add_argument("--distributed_launcher", default="slurm") +parser.add_argument("--distributed_backend", default="nccl") +parser.add_argument( + "--seed", default=42, type=int, help="seed for initializing training. " +) +parser.add_argument( + "--pretrained_checkpoint", + default="", + type=str, + help="Path to simsiam pretrained checkpoint.", +) +parser.add_argument("--lars", action="store_true", help="Use LARS") +parser.add_argument( + "--checkpoint_dir", + default="", + help="Checkpoint directory to save eval model checkpoints.", +) +parser.add_argument( + "-p", + "--print-freq", + default=10, + type=int, + metavar="N", + help="print frequency (default: 10)", +) + +best_acc1 = 0 + + +def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: + """Initialize worker processes with a random seed. + + Parameters + ---------- + worker_id : int + ID of the worker process. + num_workers : int + Total number of workers that will be initialized. + rank : int + The rank of the current process. + seed : int + A random seed used determine the worker seed. 
def main():
    """Run linear evaluation of a SimSiam-pretrained encoder.

    Builds the backbone named by ``--arch``, freezes everything except the
    final ``fc`` layer, optionally loads encoder weights from
    ``--pretrained_checkpoint``, wraps the model in DistributedDataParallel and
    trains/validates the linear classifier for ``--epochs`` epochs. Rank 0
    writes a checkpoint per epoch when ``--checkpoint_dir`` is set.

    Raises
    ------
    ValueError
        If ``--pretrained_checkpoint`` is given but is not an existing file.
    NotImplementedError
        If the script is not run in distributed mode.
    """
    args = parser.parse_args()
    global best_acc1

    # torch.multiprocessing.set_start_method("spawn")
    if args.distributed_mode:
        setup()
        torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
        torch.cuda.empty_cache()
        device_id = torch.cuda.current_device()
    else:
        device_id = None

    # create model
    print(f"Creating model {args.arch}")
    model = models.__dict__[args.arch]()

    # freeze all layers but the last fc
    for name, param in model.named_parameters():
        if name not in ["fc.weight", "fc.bias"]:
            param.requires_grad = False
    # init the fc layer
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()

    # load from pre-trained, before DistributedDataParallel constructor
    if args.pretrained_checkpoint:
        if os.path.isfile(args.pretrained_checkpoint):
            print(f"Loading checkpoint {args.pretrained_checkpoint}")
            checkpoint = torch.load(args.pretrained_checkpoint, map_location="cpu")

            # rename moco pre-trained keys
            state_dict = checkpoint["state_dict"]
            for k in list(state_dict.keys()):
                # retain only encoder up to before the embedding layer
                if k.startswith("module.encoder") and not k.startswith(
                    "module.encoder.fc"
                ):
                    # remove prefix
                    state_dict[k[len("module.encoder.") :]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]

            msg = model.load_state_dict(state_dict, strict=False)
            assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}
        else:
            raise ValueError(f"No checkpoint found at: {args.pretrained_checkpoint}")

    # infer learning rate before changing batch size
    init_lr = args.lr * args.batch_size / 256

    if args.distributed_mode and dist_utils.is_dist_avail_and_initialized():
        # torch.cuda.set_device(device_id)
        model = model.cuda(device_id)
        model = DDP(model, device_ids=[device_id])
    else:
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(device_id)

    # optimize only the linear classifier
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    assert len(parameters) == 2  # fc.weight, fc.bias

    # TODO(arashaf): Enable Adam optimizer
    optimizer = torch.optim.SGD(
        parameters,
        init_lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )
    if args.lars:
        print("Use LARS optimizer.")
        # LARC wraps the optimizer; the wrapper must replace `optimizer`,
        # otherwise `optimizer.step()` below is still plain SGD and --lars is
        # a no-op.
        # NOTE(review): assumes the local LARC exposes param_groups/state_dict
        # like the apex implementation -- confirm.
        optimizer = LARC(optimizer=optimizer, trust_coefficient=0.001, clip=False)

    cudnn.benchmark = True

    # Data loading code
    train_dir = os.path.join(args.data_dir, "train")
    val_dir = os.path.join(args.data_dir, "val")
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    )

    train_dataset = datasets.ImageFolder(
        train_dir,
        transforms.Compose(
            [
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ],
        ),
    )

    if dist_utils.is_dist_avail_and_initialized() and args.distributed_mode:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            seed=args.seed,
        )
    else:
        train_sampler = None

    init_fn = partial(
        worker_init_fn,
        num_workers=args.num_workers,
        rank=dist_utils.get_rank(),
        seed=args.seed,
    )
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        sampler=train_sampler,
        num_workers=args.num_workers,
        worker_init_fn=init_fn,
        pin_memory=True,  # TODO(arashaf): this was set to false in training script.
    )

    val_dataset = datasets.ImageFolder(
        val_dir,
        transforms.Compose(
            [
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ],
        ),
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=256,
        shuffle=False,
        num_workers=args.num_workers,
        pin_memory=True,
    )

    for epoch in tqdm(range(args.epochs)):
        print(f"Starting training epoch: {epoch}")
        # Guard on the sampler itself: it is only created when both the
        # process group and --distributed_mode are active.
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, init_lr, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, device_id, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, device_id, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if args.checkpoint_dir and dist_utils.get_rank() == 0:
            os.makedirs(args.checkpoint_dir, exist_ok=True)
            checkpoint_name = "eval_checkpoint_{:04d}.pth.tar".format(epoch)
            checkpoint_file = os.path.join(args.checkpoint_dir, checkpoint_name)
            save_checkpoint(
                {
                    "epoch": epoch,
                    "arch": args.arch,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "optimizer": optimizer.state_dict(),
                },
                is_best,
                checkpoint_file,
            )
            # NOTE(review): nesting of this check under the rank-0 branch is
            # reconstructed from a whitespace-collapsed source -- confirm.
            # Without a pretrained checkpoint there is nothing to compare
            # against, and torch.load("") would raise.
            if epoch == 0 and args.pretrained_checkpoint:
                sanity_check(model.state_dict(), args.pretrained_checkpoint)

    # Tear down the process group created by setup(); only reachable in
    # distributed mode because the non-distributed path raises above.
    cleanup()
model. + BatchNorm in train mode may revise running mean/std (even if it receives + no gradient), which are part of the model parameters too. + """ + model.eval() + + end = time.time() + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + images = images.cuda(device_id, non_blocking=True) + target = target.cuda(device_id, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + +def validate(val_loader, model, criterion, device_id, args): + batch_time = AverageMeter("Time", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") + progress = ProgressMeter( + len(val_loader), [batch_time, losses, top1, top5], prefix="Test: " + ) + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(val_loader): + images = images.cuda(device_id, non_blocking=True) + target = target.cuda(device_id, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + print( + "Validation 
def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"):
    """Serialize ``state`` to ``filename`` and optionally keep a best-model copy.

    Parameters
    ----------
    state : dict
        Object handed to ``torch.save``.
    is_best : bool
        When true, the file just written is also copied to
        ``model_best.pth.tar`` in the same directory as ``filename``.
    filename : str
        Destination path of the checkpoint.
    """
    print(f"Saving checkpoint at: {filename}")
    torch.save(state, filename)
    if is_best:
        # Keep the best model next to its checkpoint instead of in the current
        # working directory, where runs sharing a cwd would overwrite each
        # other. A bare filename has an empty dirname, so the default call
        # still writes to the working directory as before.
        best_path = os.path.join(os.path.dirname(filename), "model_best.pth.tar")
        shutil.copyfile(filename, best_path)
def accuracy(output, target, topk=(1,)):
    """Compute top-k accuracy (in percent) for each k in ``topk``.

    Parameters
    ----------
    output : torch.Tensor
        Scores indexed as ``[sample, class]`` (``topk`` is taken over dim 1).
    target : torch.Tensor
        Ground-truth class index per sample.
    topk : tuple of int
        The k values to evaluate. A k larger than the number of classes is
        treated as "all classes" instead of raising.

    Returns
    -------
    list of torch.Tensor
        One single-element tensor per entry of ``topk``.
    """
    with torch.no_grad():
        # Tensor.topk raises when k exceeds the size of the class dimension
        # (e.g. top-5 on a dataset with fewer than 5 classes), so clamp it.
        maxk = min(max(topk), output.size(1))
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # Slicing past the last row simply keeps all `maxk` rows.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
#!/bin/bash

#SBATCH --job-name="simsiam_multi_train"
#SBATCH --partition=a40
#SBATCH --qos=m2
#SBATCH --nodes=2
#SBATCH --gres=gpu:a40:4
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=0
#SBATCH --output=multinode-%j.out
#SBATCH --error=multinode-%j.err
#SBATCH --open-mode=append
#SBATCH --wait-all-nodes=1
#SBATCH --time=08:00:00

# load virtual environment
source /ssd003/projects/aieng/envs/genssl2/bin/activate

export NCCL_IB_DISABLE=1  # Our cluster does not have InfiniBand. We need to disable usage using this flag.
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1  # set to 1 for NCCL backend
# export CUDA_LAUNCH_BLOCKING=1

# Rendezvous endpoint: this host plus a free port picked by the kernel.
# The values are exported so the single-quoted command below can expand them
# inside each srun task.
export MASTER_ADDR="$(hostname --fqdn)"
export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')"
export RDVZ_ID=$RANDOM
echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT"

export PYTHONPATH="."
nvidia-smi

# One torchrun launcher per node; torchrun spawns one worker per GPU.
# The argument list must end at --pretrained_checkpoint: a trailing "/" here
# would be passed to the Python script as an extra positional argument and
# argparse would abort with "unrecognized arguments".
srun -p $SLURM_JOB_PARTITION \
    -c $SLURM_CPUS_ON_NODE \
    -N $SLURM_JOB_NUM_NODES \
    --mem=0 \
    --gres=gpu:$SLURM_JOB_PARTITION:$SLURM_GPUS_ON_NODE \
    bash -c 'torchrun \
    --nproc-per-node=$SLURM_GPUS_ON_NODE \
    --nnodes=$SLURM_JOB_NUM_NODES \
    --rdzv-endpoint $MASTER_ADDR:$MASTER_PORT \
    --rdzv-id $RDVZ_ID \
    --rdzv-backend c10d \
    simsiam/adil_linear_eval_original_logs.py \
    --data_dir="/scratch/ssd004/datasets/imagenet256" \
    --arch="resnet50" \
    --distributed_mode \
    --batch-size=1024 \
    --lars \
    --pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_2024-02-29-15-27/checkpoint_0099.pth.tar"
    '
+nvidia-smi + +torchrun --nproc-per-node=4 --nnodes=1 simsiam/adil_linear_eval_original_logs.py \ + --data_dir="/scratch/ssd004/datasets/imagenet256" \ + --arch="resnet50" \ + --distributed_mode \ + --batch-size=1024 \ + --lars \ + --pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_2024-02-29-15-27/checkpoint_0099.pth.tar" \ No newline at end of file From 4762d19e371d52ccb4ccd2dfc874801924491ec2 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 4 Mar 2024 15:58:23 -0800 Subject: [PATCH 13/38] update training --- simsiam/adil_main_simsiam.py | 7 ++++--- train_simsiam_multinode.slrm | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/simsiam/adil_main_simsiam.py b/simsiam/adil_main_simsiam.py index 09af62e..3db9cb8 100644 --- a/simsiam/adil_main_simsiam.py +++ b/simsiam/adil_main_simsiam.py @@ -264,8 +264,8 @@ def main(): pin_memory=False, drop_last=True, ) - - print(f"Creating model {args.arch}") + if dist_utils.get_rank() == 0: + print(f"Creating model {args.arch}") model = builder.SimSiam(models.__dict__[args.arch], args.dim, args.pred_dim) if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): @@ -278,7 +278,8 @@ def main(): model = DDP(model, device_ids=[device_id]) else: raise NotImplementedError("Only DistributedDataParallel is supported.") - print(model) # print model after SyncBatchNorm + if dist_utils.get_rank() == 0: + print(model) # print model after SyncBatchNorm # define loss function (criterion) and optimizer criterion = nn.CosineSimilarity(dim=1).cuda(device_id) diff --git a/train_simsiam_multinode.slrm b/train_simsiam_multinode.slrm index 8403149..c3b5be4 100644 --- a/train_simsiam_multinode.slrm +++ b/train_simsiam_multinode.slrm @@ -46,9 +46,9 @@ srun -p $SLURM_JOB_PARTITION \ --fix-pred-lr \ --distributed_mode \ --batch-size=128 \ - --epochs=100 \ + --epochs=200 \ --experiment="simsiam_stablediff_p0p5_seed43" \ - --resume_from_checkpoint="" \ + 
--resume_from_checkpoint="/projects/imagenet_synthetic/model_checkpoints/_original_simsiam/checkpoint_0099.pth.tar" \ --seed=43 \ --use_synthetic_data \ --synthetic_data_dir="/projects/imagenet_synthetic/arashaf_stablediff_batched" \ From 16e683d56b2bfad8bd6faf2ba23d723731140668 Mon Sep 17 00:00:00 2001 From: Arash Afkanpour Date: Tue, 5 Mar 2024 11:17:42 -0500 Subject: [PATCH 14/38] changed the train slurm script --- simsiam/adil_main_simsiam.py | 3 ++- train_simsiam_multinode.slrm | 18 +++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/simsiam/adil_main_simsiam.py b/simsiam/adil_main_simsiam.py index 3db9cb8..bdd3de9 100644 --- a/simsiam/adil_main_simsiam.py +++ b/simsiam/adil_main_simsiam.py @@ -16,6 +16,7 @@ import torch.optim import torch.utils.data import torch.utils.data.distributed +from torch import distributed as dist from torch import nn from torch.backends import cudnn from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 @@ -25,7 +26,6 @@ from SimCLR import distributed as dist_utils from simsiam import builder, loader -from torch import distributed as dist model_names = sorted( @@ -188,6 +188,7 @@ def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> No torch.manual_seed(worker_seed) random.seed(worker_seed) + def setup() -> None: """Initialize the process group.""" dist.init_process_group("nccl") diff --git a/train_simsiam_multinode.slrm b/train_simsiam_multinode.slrm index c3b5be4..f90f3c8 100644 --- a/train_simsiam_multinode.slrm +++ b/train_simsiam_multinode.slrm @@ -2,17 +2,17 @@ #SBATCH --job-name="simsiam_multi_train" #SBATCH --partition=a40 -#SBATCH --qos=m2 +#SBATCH --account=deadline +#SBATCH --qos=deadline #SBATCH --nodes=2 #SBATCH --gres=gpu:a40:4 #SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --mem=0 -#SBATCH --output=multinode-%j.out -#SBATCH --error=multinode-%j.err #SBATCH --open-mode=append #SBATCH --wait-all-nodes=1 -#SBATCH --time=08:00:00 
+#SBATCH --time=01:00:00 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=8G +#SBATCH --output=slurm-%j.out # load virtual environment source /ssd003/projects/aieng/envs/genssl2/bin/activate @@ -47,11 +47,11 @@ srun -p $SLURM_JOB_PARTITION \ --distributed_mode \ --batch-size=128 \ --epochs=200 \ - --experiment="simsiam_stablediff_p0p5_seed43" \ + --experiment="simsiam_icgan_seed43_bs128_rforig" \ --resume_from_checkpoint="/projects/imagenet_synthetic/model_checkpoints/_original_simsiam/checkpoint_0099.pth.tar" \ --seed=43 \ --use_synthetic_data \ - --synthetic_data_dir="/projects/imagenet_synthetic/arashaf_stablediff_batched" \ + --synthetic_data_dir="/projects/imagenet_synthetic/synthetic_icgan" \ --synthetic_index_min=0 \ - --synthetic_index_max=1 \ + --synthetic_index_max=4 \ --generative_augmentation_prob=0.5' \ No newline at end of file From 71b256b28ef6e497665f62b5474ef4ec5bc4e1de Mon Sep 17 00:00:00 2001 From: fforghani Date: Tue, 5 Mar 2024 12:00:18 -0500 Subject: [PATCH 15/38] Add food101 and places365 to linear evaluation script --- .../eval_food101_original_simsiam_100.slrm | 39 ++ .../eval_food101_simsiam_baseline_100.slrm | 40 ++ .../eval_food101_simsiam_icgan_100.slrm | 39 ++ .../eval_food101_simsiam_stablediff_100.slrm | 39 ++ .../eval_places365_original_simsiam_100.slrm | 39 ++ .../eval_places365_simsiam_baseline_100.slrm | 39 ++ .../eval_places365_simsiam_icgan_100.slrm | 39 ++ ...eval_places365_simsiam_stablediff_100.slrm | 39 ++ simsiam/linear_eval_downstream_datasets.py | 498 ++++++++++++++++++ 9 files changed, 811 insertions(+) create mode 100644 eval_scripts/food101/eval_food101_original_simsiam_100.slrm create mode 100644 eval_scripts/food101/eval_food101_simsiam_baseline_100.slrm create mode 100644 eval_scripts/food101/eval_food101_simsiam_icgan_100.slrm create mode 100644 eval_scripts/food101/eval_food101_simsiam_stablediff_100.slrm create mode 100644 eval_scripts/places365/eval_places365_original_simsiam_100.slrm create mode 100644 
eval_scripts/places365/eval_places365_simsiam_baseline_100.slrm create mode 100644 eval_scripts/places365/eval_places365_simsiam_icgan_100.slrm create mode 100644 eval_scripts/places365/eval_places365_simsiam_stablediff_100.slrm create mode 100644 simsiam/linear_eval_downstream_datasets.py diff --git a/eval_scripts/food101/eval_food101_original_simsiam_100.slrm b/eval_scripts/food101/eval_food101_original_simsiam_100.slrm new file mode 100644 index 0000000..7f38bfc --- /dev/null +++ b/eval_scripts/food101/eval_food101_original_simsiam_100.slrm @@ -0,0 +1,39 @@ +#!/bin/bash + +#SBATCH --job-name="simsiam_eval" +#SBATCH --partition=t4v2 +#SBATCH --account=deadline +#SBATCH --qos=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --time=36:00:00 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=8G +#SBATCH --output=slurm-%j.out + + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export PYTHONPATH="." 
+nvidia-smi + +# “srun” executes the script once per task (nodes * ntasks-per-node times) +srun python simsiam/linear_eval_downstream_datasets.py \ +--data_dir="/projects/imagenet_synthetic/fereshteh_datasets/" \ +--checkpoint_dir="/projects/imagenet_synthetic/model_checkpoints/food101/evaluate_original"\ +--arch="resnet50" \ +--distributed_mode \ +--batch-size=1024 \ +--lars \ +--dataset_name="food101" \ +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/_original_simsiam/checkpoint_0099.pth.tar" diff --git a/eval_scripts/food101/eval_food101_simsiam_baseline_100.slrm b/eval_scripts/food101/eval_food101_simsiam_baseline_100.slrm new file mode 100644 index 0000000..1954f69 --- /dev/null +++ b/eval_scripts/food101/eval_food101_simsiam_baseline_100.slrm @@ -0,0 +1,40 @@ +#!/bin/bash + +#SBATCH --job-name="simsiam_eval" +#SBATCH --partition=t4v2 +#SBATCH --account=deadline +#SBATCH --qos=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --time=36:00:00 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=8G +#SBATCH --output=slurm-%j.out + + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export PYTHONPATH="." 
+nvidia-smi + +# “srun” executes the script once per task (nodes * ntasks-per-node times) +srun python simsiam/linear_eval_downstream_datasets.py \ +--data_dir="/projects/imagenet_synthetic/fereshteh_datasets/" \ +--checkpoint_dir="/projects/imagenet_synthetic/model_checkpoints/food101/evaluate_baseline"\ +--arch="resnet50" \ +--distributed_mode \ +--batch-size=1024 \ +--lars \ +--dataset_name="food101" \ +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_2024-02-29-14-49/checkpoint_0099.pth.tar" +# --pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_2024-02-29-14-49/checkpoint_0090.pth.tar" diff --git a/eval_scripts/food101/eval_food101_simsiam_icgan_100.slrm b/eval_scripts/food101/eval_food101_simsiam_icgan_100.slrm new file mode 100644 index 0000000..4545b1c --- /dev/null +++ b/eval_scripts/food101/eval_food101_simsiam_icgan_100.slrm @@ -0,0 +1,39 @@ +#!/bin/bash + +#SBATCH --job-name="simsiam_eval" +#SBATCH --partition=t4v2 +#SBATCH --account=deadline +#SBATCH --qos=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --time=36:00:00 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=8G +#SBATCH --output=slurm-%j.out + + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export PYTHONPATH="." 
+nvidia-smi + +# “srun” executes the script once per task (nodes * ntasks-per-node times) +srun python simsiam/linear_eval_downstream_datasets.py \ +--data_dir="/projects/imagenet_synthetic/fereshteh_datasets/" \ +--checkpoint_dir="/projects/imagenet_synthetic/model_checkpoints/food101/evaluate_icgan"\ +--arch="resnet50" \ +--distributed_mode \ +--batch-size=1024 \ +--lars \ +--dataset_name="food101" \ +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_2024-02-29-18-40/checkpoint_0099.pth.tar" diff --git a/eval_scripts/food101/eval_food101_simsiam_stablediff_100.slrm b/eval_scripts/food101/eval_food101_simsiam_stablediff_100.slrm new file mode 100644 index 0000000..5cc53fc --- /dev/null +++ b/eval_scripts/food101/eval_food101_simsiam_stablediff_100.slrm @@ -0,0 +1,39 @@ +#!/bin/bash + +#SBATCH --job-name="simsiam_eval" +#SBATCH --partition=t4v2 +#SBATCH --account=deadline +#SBATCH --qos=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --time=36:00:00 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=8G +#SBATCH --output=slurm-%j.out + + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export PYTHONPATH="." 
+nvidia-smi + +# “srun” executes the script once per task (nodes * ntasks-per-node times) +srun python simsiam/linear_eval_downstream_datasets.py \ +--data_dir="/projects/imagenet_synthetic/fereshteh_datasets/" \ +--checkpoint_dir="/projects/imagenet_synthetic/model_checkpoints/food101/evaluate_stable_diff"\ +--arch="resnet50" \ +--distributed_mode \ +--batch-size=1024 \ +--lars \ +--dataset_name="food101" \ +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_2024-02-29-15-27/checkpoint_0099.pth.tar" diff --git a/eval_scripts/places365/eval_places365_original_simsiam_100.slrm b/eval_scripts/places365/eval_places365_original_simsiam_100.slrm new file mode 100644 index 0000000..a617e41 --- /dev/null +++ b/eval_scripts/places365/eval_places365_original_simsiam_100.slrm @@ -0,0 +1,39 @@ +#!/bin/bash + +#SBATCH --job-name="simsiam_eval" +#SBATCH --partition=t4v2 +#SBATCH --account=deadline +#SBATCH --qos=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --time=36:00:00 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=8G +#SBATCH --output=slurm-%j.out + + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export PYTHONPATH="." 
+nvidia-smi + +# “srun” executes the script once per task (nodes * ntasks-per-node times) +srun python simsiam/linear_eval_downstream_datasets.py \ +--data_dir="/projects/imagenet_synthetic/fereshteh_datasets/places365/" \ +--checkpoint_dir="/projects/imagenet_synthetic/model_checkpoints/places365/evaluate_original"\ +--arch="resnet50" \ +--distributed_mode \ +--batch-size=1024 \ +--lars \ +--dataset_name="places365" \ +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/_original_simsiam/checkpoint_0099.pth.tar" diff --git a/eval_scripts/places365/eval_places365_simsiam_baseline_100.slrm b/eval_scripts/places365/eval_places365_simsiam_baseline_100.slrm new file mode 100644 index 0000000..0160209 --- /dev/null +++ b/eval_scripts/places365/eval_places365_simsiam_baseline_100.slrm @@ -0,0 +1,39 @@ +#!/bin/bash + +#SBATCH --job-name="simsiam_eval" +#SBATCH --partition=t4v2 +#SBATCH --account=deadline +#SBATCH --qos=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --time=36:00:00 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=8G +#SBATCH --output=slurm-%j.out + + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export PYTHONPATH="." 
+nvidia-smi + +# “srun” executes the script once per task (nodes * ntasks-per-node times) +srun python simsiam/linear_eval_downstream_datasets.py \ +--data_dir="/projects/imagenet_synthetic/fereshteh_datasets/places365/" \ +--checkpoint_dir="/projects/imagenet_synthetic/model_checkpoints/places365/evaluate_baseline"\ +--arch="resnet50" \ +--distributed_mode \ +--batch-size=1024 \ +--lars \ +--dataset_name="places365" \ +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_2024-02-29-14-49/checkpoint_0099.pth.tar" \ No newline at end of file diff --git a/eval_scripts/places365/eval_places365_simsiam_icgan_100.slrm b/eval_scripts/places365/eval_places365_simsiam_icgan_100.slrm new file mode 100644 index 0000000..48975e1 --- /dev/null +++ b/eval_scripts/places365/eval_places365_simsiam_icgan_100.slrm @@ -0,0 +1,39 @@ +#!/bin/bash + +#SBATCH --job-name="simsiam_eval" +#SBATCH --partition=t4v2 +#SBATCH --account=deadline +#SBATCH --qos=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --time=36:00:00 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=8G +#SBATCH --output=slurm-%j.out + + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export PYTHONPATH="." 
+nvidia-smi + +# “srun” executes the script once per task (nodes * ntasks-per-node times) +srun python simsiam/linear_eval_downstream_datasets.py \ +--data_dir="/projects/imagenet_synthetic/fereshteh_datasets/places365/" \ +--checkpoint_dir="/projects/imagenet_synthetic/model_checkpoints/places365/evaluate_icgan"\ +--arch="resnet50" \ +--distributed_mode \ +--batch-size=1024 \ +--lars \ +--dataset_name="places365" \ +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_2024-02-29-18-40/checkpoint_0099.pth.tar" diff --git a/eval_scripts/places365/eval_places365_simsiam_stablediff_100.slrm b/eval_scripts/places365/eval_places365_simsiam_stablediff_100.slrm new file mode 100644 index 0000000..cd8de93 --- /dev/null +++ b/eval_scripts/places365/eval_places365_simsiam_stablediff_100.slrm @@ -0,0 +1,39 @@ +#!/bin/bash + +#SBATCH --job-name="simsiam_eval" +#SBATCH --partition=t4v2 +#SBATCH --account=deadline +#SBATCH --qos=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --time=36:00:00 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=4 +#SBATCH --mem-per-cpu=8G +#SBATCH --output=slurm-%j.out + + +PY_ARGS=${@:1} + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +export CUDA_LAUNCH_BLOCKING=1 + +export MASTER_ADDR=$(hostname) +export MASTER_PORT=45679 + +export PYTHONPATH="." 
+nvidia-smi + +# “srun” executes the script once per task (nodes * ntasks-per-node times) +srun python simsiam/linear_eval_downstream_datasets.py \ +--data_dir="/projects/imagenet_synthetic/fereshteh_datasets/places365/" \ +--checkpoint_dir="/projects/imagenet_synthetic/model_checkpoints/places365/evaluate_stable_diff"\ +--arch="resnet50" \ +--distributed_mode \ +--batch-size=1024 \ +--lars \ +--dataset_name="places365" \ +--pretrained_checkpoint="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_2024-02-29-15-27/checkpoint_0099.pth.tar" diff --git a/simsiam/linear_eval_downstream_datasets.py b/simsiam/linear_eval_downstream_datasets.py new file mode 100644 index 0000000..616b145 --- /dev/null +++ b/simsiam/linear_eval_downstream_datasets.py @@ -0,0 +1,498 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import math +import os +import random +import shutil +from functools import partial + +import torch +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +from torch import nn +from torch.backends import cudnn +from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 +from torchvision import datasets, models, transforms +from tqdm import tqdm + +from SimCLR import distributed as dist_utils + + +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + +parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") +parser.add_argument( + "--data_dir", + metavar="DIR", + default="/scratch/ssd004/datasets/imagenet256", + help="path to dataset.", +) +parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet50", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) 
+parser.add_argument( + "-j", + "--num_workers", + default=4, + type=int, + metavar="N", + help="number of data loading workers (default: 32)", +) +parser.add_argument( + "--epochs", default=90, type=int, metavar="N", help="number of total epochs to run" +) +parser.add_argument( + "-b", + "--batch-size", + default=4096, + type=int, + metavar="N", + help="mini-batch size (default: 4096), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) +parser.add_argument( + "--lr", + "--learning-rate", + default=0.1, + type=float, + metavar="LR", + help="initial (base) learning rate", + dest="lr", +) +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--wd", + "--weight-decay", + default=0.0, + type=float, + metavar="W", + help="weight decay (default: 0.)", + dest="weight_decay", +) +parser.add_argument( + "--distributed_mode", + action="store_true", + help="Enable distributed training", +) +parser.add_argument("--distributed_launcher", default="slurm") +parser.add_argument("--distributed_backend", default="nccl") +parser.add_argument( + "--seed", default=42, type=int, help="seed for initializing training. " +) +parser.add_argument( + "--pretrained_checkpoint", + default="", + type=str, + help="Path to simsiam pretrained checkpoint.", +) +parser.add_argument("--lars", action="store_true", help="Use LARS") +parser.add_argument( + "--checkpoint_dir", + default="", + help="Checkpoint directory to save eval model checkpoints.", +) +parser.add_argument("--dataset_name", default="imagenet", help="Name of the dataset.") + +best_acc1 = 0 + + +def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int) -> None: + """Initialize worker processes with a random seed. + + Parameters + ---------- + worker_id : int + ID of the worker process. + num_workers : int + Total number of workers that will be initialized. 
+ rank : int + The rank of the current process. + seed : int + A random seed used determine the worker seed. + """ + worker_seed = num_workers * rank + worker_id + seed + torch.manual_seed(worker_seed) + random.seed(worker_seed) + + +def main(): + args = parser.parse_args() + global best_acc1 + + torch.multiprocessing.set_start_method("spawn") + if args.distributed_mode: + dist_utils.init_distributed_mode( + launcher=args.distributed_launcher, + backend=args.distributed_backend, + ) + device_id = torch.cuda.current_device() + else: + device_id = None + + # create model + print(f"Creating model {args.arch}") + model = models.__dict__[args.arch]() + + # freeze all layers but the last fc + for name, param in model.named_parameters(): + if name not in ["fc.weight", "fc.bias"]: + param.requires_grad = False + # init the fc layer + model.fc.weight.data.normal_(mean=0.0, std=0.01) + model.fc.bias.data.zero_() + + # load from pre-trained, before DistributedDataParallel constructor + if args.pretrained_checkpoint: + if os.path.isfile(args.pretrained_checkpoint): + print(f"Loading checkpoint {args.pretrained_checkpoint}") + checkpoint = torch.load(args.pretrained_checkpoint, map_location="cpu") + + # rename moco pre-trained keys + state_dict = checkpoint["state_dict"] + for k in list(state_dict.keys()): + # retain only encoder up to before the embedding layer + if k.startswith("module.encoder") and not k.startswith( + "module.encoder.fc" + ): + # remove prefix + state_dict[k[len("module.encoder.") :]] = state_dict[k] + # delete renamed or unused k + del state_dict[k] + + msg = model.load_state_dict(state_dict, strict=False) + assert set(msg.missing_keys) == {"fc.weight", "fc.bias"} + else: + raise ValueError(f"No checkpoint found at: {args.pretrained_checkpoint}") + + # infer learning rate before changing batch size + init_lr = args.lr * args.batch_size * 4 / 256 + + if args.distributed_mode and dist_utils.is_dist_avail_and_initialized(): + torch.cuda.set_device(device_id) + 
model = model.cuda(device_id) + model = DDP(model, device_ids=[device_id]) + else: + raise NotImplementedError("Only DistributedDataParallel is supported.") + + # define loss function (criterion) and optimizer + criterion = nn.CrossEntropyLoss().cuda(device_id) + + # optimize only the linear classifier + parameters = list(filter(lambda p: p.requires_grad, model.parameters())) + assert len(parameters) == 2 # fc.weight, fc.bias + + optimizer = torch.optim.SGD( + parameters, init_lr, momentum=args.momentum, weight_decay=args.weight_decay + ) + if args.lars: + print("Use LARS optimizer.") + # from apex.parallel.LARC import LARC + from LARC import LARC + + optimizer = LARC(optimizer=optimizer, trust_coefficient=0.001, clip=False) + + cudnn.benchmark = True + + # Data loading code + train_dir = os.path.join(args.data_dir, "train") + val_dir = os.path.join(args.data_dir, "val") + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + if args.dataset_name == "imagenet": + train_dataset = datasets.ImageFolder( + train_dir, + transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ], + ), + ) + val_dataset = datasets.ImageFolder( + val_dir, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ], + ), + ) + elif args.dataset_name == "food101": + train_dataset=datasets.Food101( + root=args.data_dir, + split="train", + transform=transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ], + ),) + val_dataset=datasets.Food101( + root=args.data_dir, + split="test", + transform=transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ], + ),) + elif args.dataset_name == "places365": + train_dataset=datasets.Places365( + root=args.data_dir, + 
split="train-standard", + transform=transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ], + ),) + val_dataset=datasets.Places365( + root=args.data_dir, + split="val", + transform=transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ], + ),) + + if dist_utils.is_dist_avail_and_initialized() and args.distributed_mode: + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, + seed=args.seed, + ) + else: + train_sampler = None + + init_fn = partial( + worker_init_fn, + num_workers=args.num_workers, + rank=dist_utils.get_rank(), + seed=args.seed, + ) + train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.batch_size, + shuffle=(train_sampler is None), + sampler=train_sampler, + num_workers=args.num_workers, + worker_init_fn=init_fn, + pin_memory=True, # TODO(arashaf): this was set to false in training script. 
+ ) + + val_loader = torch.utils.data.DataLoader( + val_dataset, + batch_size=256, + shuffle=False, + num_workers=args.num_workers, + pin_memory=True, + ) + + for epoch in tqdm(range(args.epochs)): + print(f"Starting training epoch: {epoch}") + if dist_utils.is_dist_avail_and_initialized(): + train_sampler.set_epoch(epoch) + adjust_learning_rate(optimizer, init_lr, epoch, args) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, device_id, args) + + # evaluate on validation set + acc1 = validate(val_loader, model, criterion, device_id, args) + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if args.checkpoint_dir and dist_utils.get_rank() == 0: + os.makedirs(args.checkpoint_dir, exist_ok=True) + checkpoint_name = "eval_checkpoint_{:04d}.pth.tar".format(epoch) + checkpoint_file = os.path.join(args.checkpoint_dir, checkpoint_name) + save_checkpoint( + { + "epoch": epoch, + "arch": args.arch, + "state_dict": model.state_dict(), + "best_acc1": best_acc1, + "optimizer": optimizer.state_dict(), + }, + is_best, + checkpoint_file, + ) + if epoch == 0: + sanity_check(model.state_dict(), args.pretrained_checkpoint) + + +def train(train_loader, model, criterion, optimizer, epoch, device_id, args): + """ + Switch to eval mode: + Under the protocol of linear classification on frozen features/models, + it is not legitimate to change any part of the pre-trained model. + BatchNorm in train mode may revise running mean/std (even if it receives + no gradient), which are part of the model parameters too. 
+ """ + model.eval() + + for images, target in tqdm(train_loader): + images = images.cuda(device_id, non_blocking=True) + target = target.cuda(device_id, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +def validate(val_loader, model, criterion, device_id, args): + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + for images, target in tqdm(val_loader): + images = images.cuda(device_id, non_blocking=True) + target = target.cuda(device_id, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + print( + "Validation Accuracy@1 {top1.avg:.3f}, Accuracy@5 {top5.avg:.3f}".format( + top1=top1, top5=top5 + ) + ) + + return top1.avg + + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + print(f"Saving checkpoint at: {filename}") + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, "model_best.pth.tar") + + +def sanity_check(state_dict, pretrained_weights): + """ + Linear classifier should not change any weights other than the linear layer. + This sanity check asserts nothing wrong happens (e.g., BN stats updated). + """ + print(f"Loading {pretrained_weights} for sanity check") + checkpoint = torch.load(pretrained_weights, map_location="cpu") + state_dict_pre = checkpoint["state_dict"] + + for k in list(state_dict.keys()): + # only ignore fc layer + if "fc.weight" in k or "fc.bias" in k: + continue + + # name in pretrained model + k_pre = ( + "module.encoder." 
+ k[len("module.") :] + if k.startswith("module.") + else "module.encoder." + k + ) + + assert ( + state_dict[k].cpu() == state_dict_pre[k_pre] + ).all(), "{} is changed in linear classifier training.".format(k) + + print("Sanity check passed.") + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=":f"): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" + return fmtstr.format(**self.__dict__) + + +def adjust_learning_rate(optimizer, init_lr, epoch, args): + """Decay the learning rate based on schedule""" + cur_lr = init_lr * 0.5 * (1.0 + math.cos(math.pi * epoch / args.epochs)) + for param_group in optimizer.param_groups: + param_group["lr"] = cur_lr + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == "__main__": + main() From c2fcc808fdf534583e8758741163f3ae9536a448 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 6 Mar 2024 08:31:11 -0800 Subject: [PATCH 16/38] add original eval --- original_eval_simsiam.slrm | 46 +++ simsiam/linear_eval_original_code.py | 576 +++++++++++++++++++++++++++ 2 files changed, 622 insertions(+) create mode 100644 original_eval_simsiam.slrm create mode 100644 simsiam/linear_eval_original_code.py diff --git 
a/original_eval_simsiam.slrm b/original_eval_simsiam.slrm new file mode 100644 index 0000000..a9b6e1e --- /dev/null +++ b/original_eval_simsiam.slrm @@ -0,0 +1,46 @@ +#!/bin/bash + +#SBATCH --job-name="sana_eval" +#SBATCH --partition=t4v2 + +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=singlenode_stablediff_160_%j.out +#SBATCH --error=singlenode_stablediff_160_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." +nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/scratch/ssd004/datasets/imagenet256" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars --batch-size=2048 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_p0p5_seed43_2024-03-05-13-39/checkpoint_0160.pth.tar" \ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py new file mode 100644 index 0000000..a2dc512 --- /dev/null +++ b/simsiam/linear_eval_original_code.py @@ -0,0 +1,576 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. 
+ +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import builtins +import math +import os +import random +import shutil +import time +import warnings + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +import torch.multiprocessing as mp +import torch.utils.data +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torchvision.models as models + +from tqdm import tqdm + +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + +parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") +parser.add_argument( + "--data", + metavar="DIR", + default="/scratch/ssd004/datasets/imagenet256", + help="path to dataset.", +) +parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet50", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) +parser.add_argument( + "-j", + "--workers", + default=4, + type=int, + metavar="N", + help="number of data loading workers (default: 32)", +) +parser.add_argument( + "--epochs", default=90, type=int, metavar="N", help="number of total epochs to run" +) +parser.add_argument( + "-b", + "--batch-size", + default=4096, + type=int, + metavar="N", + help="mini-batch size (default: 4096), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) +parser.add_argument( + "--lr", + "--learning-rate", + default=0.1, + type=float, + metavar="LR", + help="initial (base) learning rate", + dest="lr", +) +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--wd", + 
"--weight-decay", + default=0.0, + type=float, + metavar="W", + help="weight decay (default: 0.)", + dest="weight_decay", +) +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', + help='evaluate model on validation set') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--gpu', default=None, type=int, + help='GPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') + +# additional configs: +parser.add_argument('--pretrained', default='', type=str, + help='path to simsiam pretrained checkpoint') +parser.add_argument('--lars', action='store_true', + help='Use LARS') + + +best_acc1 = 0 + + +def main(): + args = parser.parse_args() + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + #NOTE: this line can reduce speed considerably + # cudnn.deterministic = True + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! 
' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + print(args.world_size) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + ngpus_per_node = torch.cuda.device_count() + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + print("second", args.world_size) + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args, )) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + + +def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + print("spawn performed, gpu", gpu, flush=True) + args.gpu = gpu + + # suppress printing if not master + if args.multiprocessing_distributed and args.gpu != 0: + def print_pass(*args,flush=True): + pass + builtins.print = print_pass + + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu), flush=True) + + if args.distributed: + print("here", flush=True) + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + print("rank", args.rank, flush=True) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + print("second rank", args.rank, flush=True) + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + print("init_process_group", flush=True) + torch.distributed.barrier() 
+ # create model + print("=> creating model '{}'".format(args.arch), flush=True) + model = models.__dict__[args.arch]() + + print("model", model.state_dict().keys(), flush=True) + + # freeze all layers but the last fc + for name, param in model.named_parameters(): + if name not in ['fc.weight', 'fc.bias']: + param.requires_grad = False + # init the fc layer + model.fc.weight.data.normal_(mean=0.0, std=0.01) + model.fc.bias.data.zero_() + + # load from pre-trained, before DistributedDataParallel constructor + if args.pretrained: + if os.path.isfile(args.pretrained): + print("=> loading checkpoint '{}'".format(args.pretrained), flush=True) + checkpoint = torch.load(args.pretrained, map_location="cpu") + + # rename moco pre-trained keys + state_dict = checkpoint['state_dict'] + for k in list(state_dict.keys()): + # retain only encoder up to before the embedding layer + if k.startswith('module.encoder') and not k.startswith('module.encoder.fc'): + # remove prefix + state_dict[k[len("module.encoder."):]] = state_dict[k] + # delete renamed or unused k + del state_dict[k] + + args.start_epoch = 0 + msg = model.load_state_dict(state_dict, strict=False) + assert set(msg.missing_keys) == {"fc.weight", "fc.bias"} + + print("=> loaded pre-trained model '{}'".format(args.pretrained)) + else: + print("=> no checkpoint found at '{}'".format(args.pretrained)) + + # infer learning rate before changing batch size + init_lr = args.lr * args.batch_size / 256 + + if args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. 
+ if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs we have + args.batch_size = int(args.batch_size / ngpus_per_node) + print("batchsize",args.batch_size, flush=True) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + print("workers",args.workers, flush=True) + print("gpu",args.gpu, flush=True) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.gpu is not None: + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = torch.nn.DataParallel(model).cuda() + + # define loss function (criterion) and optimizer + criterion = nn.CrossEntropyLoss().cuda(args.gpu) + + # optimize only the linear classifier + parameters = list(filter(lambda p: p.requires_grad, model.parameters())) + assert len(parameters) == 2 # fc.weight, fc.bias + + optimizer = torch.optim.SGD(parameters, init_lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + if args.lars: + print("=> use LARS optimizer.", flush=True) + from LARC import LARC + optimizer = LARC(optimizer=optimizer, trust_coefficient=.001, clip=False) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume), flush=True) + if args.gpu is None: + checkpoint = torch.load(args.resume) + else: + # Map model to be loaded to specified single gpu. 
+ loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch']), flush=True) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + # Data loading code + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + else: + train_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(valdir, transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])), + batch_size=256, shuffle=False, + num_workers=args.workers, pin_memory=True) + + if args.evaluate: + validate(val_loader, model, criterion, args) + return + + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + adjust_learning_rate(optimizer, init_lr, epoch, args) + + print("epoch", epoch, flush=True) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, args) + + # evaluate on 
validation set + acc1 = validate(val_loader, model, criterion, args) + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer' : optimizer.state_dict(), + }, is_best) + if epoch == args.start_epoch: + sanity_check(model.state_dict(), args.pretrained) + + +def train(train_loader, model, criterion, optimizer, epoch, args): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch)) + + """ + Switch to eval mode: + Under the protocol of linear classification on frozen features/models, + it is not legitimate to change any part of the pre-trained model. + BatchNorm in train mode may revise running mean/std (even if it receives + no gradient), which are part of the model parameters too. 
+ """ + model.eval() + + end = time.time() + i = 0 + for images, target in tqdm(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + if i == 0: + print("first step passed", flush=True) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + i += 1 + + +def validate(val_loader, model, criterion, args): + batch_time = AverageMeter('Time', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(val_loader), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + i = 0 + for images, target in tqdm(val_loader): + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + i += 1 + + # # TODO: this should also be done with the 
ProgressMeter + print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' + .format(top1=top1, top5=top5)) + + return top1.avg + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + + +def sanity_check(state_dict, pretrained_weights): + """ + Linear classifier should not change any weights other than the linear layer. + This sanity check asserts nothing wrong happens (e.g., BN stats updated). + """ + print("=> loading '{}' for sanity check".format(pretrained_weights)) + checkpoint = torch.load(pretrained_weights, map_location="cpu") + state_dict_pre = checkpoint['state_dict'] + + for k in list(state_dict.keys()): + # only ignore fc layer + if 'fc.weight' in k or 'fc.bias' in k: + continue + + # name in pretrained model + k_pre = 'module.encoder.' + k[len('module.'):] \ + if k.startswith('module.') else 'module.encoder.' + k + + assert ((state_dict[k].cpu() == state_dict_pre[k_pre]).all()), \ + '{} is changed in linear classifier training.'.format(k) + + print("=> sanity check passed.") + + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f'): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries), flush=True) 
+ + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + + + + +def adjust_learning_rate(optimizer, init_lr, epoch, args): + """Decay the learning rate based on schedule""" + cur_lr = init_lr * 0.5 * (1. + math.cos(math.pi * epoch / args.epochs)) + for param_group in optimizer.param_groups: + param_group['lr'] = cur_lr + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == '__main__': + main() \ No newline at end of file From 6c3f957e7fd30ae6e5c90f7248dc1e4810756d13 Mon Sep 17 00:00:00 2001 From: fforghani Date: Wed, 6 Mar 2024 13:17:17 -0500 Subject: [PATCH 17/38] Add food101 and cifar10 to linear eval code --- original_eval_scripts/CIFAR10/baseline.slrm | 47 +++++++ .../INaturalist/baseline.slrm | 47 +++++++ original_eval_scripts/food101/baseline.slrm | 47 +++++++ simsiam/linear_eval_original_code.py | 118 +++++++++++++++--- 4 files changed, 241 insertions(+), 18 deletions(-) create mode 100644 original_eval_scripts/CIFAR10/baseline.slrm create mode 100644 original_eval_scripts/INaturalist/baseline.slrm create mode 100644 original_eval_scripts/food101/baseline.slrm diff --git a/original_eval_scripts/CIFAR10/baseline.slrm b/original_eval_scripts/CIFAR10/baseline.slrm new file mode 100644 index 0000000..b655f3b --- /dev/null +++ b/original_eval_scripts/CIFAR10/baseline.slrm @@ -0,0 +1,47 @@ +#!/bin/bash + +#SBATCH --job-name="cifar" +#SBATCH --partition=a40 +#SBATCH --qos=deadline +#SBATCH 
--account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=cifar10_baseline_160_%j.out +#SBATCH --error=cifar10_baseline_160_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." +nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/projects/imagenet_synthetic/fereshteh_datasets" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="cifar10" \ No newline at end of file diff --git a/original_eval_scripts/INaturalist/baseline.slrm b/original_eval_scripts/INaturalist/baseline.slrm new file mode 100644 index 0000000..e5673f4 --- /dev/null +++ b/original_eval_scripts/INaturalist/baseline.slrm @@ -0,0 +1,47 @@ +#!/bin/bash + +#SBATCH --job-name="inaturalist" +#SBATCH --partition=t4v2 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=inaturalist_baseline_160_%j.out +#SBATCH --error=inaturalist_baseline_160_%j.err 
+#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." +nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/projects/imagenet_synthetic/fereshteh_datasets" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="INaturalist" \ No newline at end of file diff --git a/original_eval_scripts/food101/baseline.slrm b/original_eval_scripts/food101/baseline.slrm new file mode 100644 index 0000000..83316cf --- /dev/null +++ b/original_eval_scripts/food101/baseline.slrm @@ -0,0 +1,47 @@ +#!/bin/bash + +#SBATCH --job-name="food101" +#SBATCH --partition=a40 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=food101_baseline_160_%j.out +#SBATCH --error=food101_baseline_160_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. 
We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." +nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/projects/imagenet_synthetic/fereshteh_datasets" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="food101" \ No newline at end of file diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index a2dc512..82db470 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -120,6 +120,7 @@ parser.add_argument('--lars', action='store_true', help='Use LARS') +parser.add_argument("--dataset_name", default="imagenet", help="Name of the dataset.") best_acc1 = 0 @@ -308,14 +309,100 @@ def print_pass(*args,flush=True): normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - train_dataset = datasets.ImageFolder( - traindir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) + if args.dataset_name == "imagenet": + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + val_dataset = datasets.ImageFolder(valdir, transforms.Compose([ + 
transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])), + elif args.dataset_name == "food101": + print("=> using food101 dataset.", flush=True) + train_dataset=datasets.Food101( + root=args.data, + split="train", + transform=transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ], + ),) + val_dataset=datasets.Food101( + root=args.data, + split="test", + transform=transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ], + ),) + elif args.dataset_name == "cifar10": + train_dataset = datasets.CIFAR10( + root=args.data, + train=True, + download=True, + transform=transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ], + ), + ) + val_dataset = datasets.CIFAR10( + root=args.data, + train=False, + download=True, + transform=transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ], + ), + ) + elif args.dataset_name == "INaturalist": + train_dataset = datasets.INaturalist( + root=args.data, + version="2018", + target_type="full", + mode="train", + transform=transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ], + ), + ) + val_dataset = datasets.INaturalist( + root=args.data_dir, + split="val", + transform=transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ], + ), + ) + if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) @@ -325,17 +412,12 @@ def print_pass(*args,flush=True): train_loader = torch.utils.data.DataLoader( train_dataset, 
batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler) - + val_loader = torch.utils.data.DataLoader( - datasets.ImageFolder(valdir, transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])), - batch_size=256, shuffle=False, - num_workers=args.workers, pin_memory=True) - + val_dataset, + batch_size=256, shuffle=False, + num_workers=args.workers, pin_memory=True) + if args.evaluate: validate(val_loader, model, criterion, args) return From 0d176fb81c3dea7a3358b162c6e067ae8d260781 Mon Sep 17 00:00:00 2001 From: fforghani Date: Wed, 6 Mar 2024 16:57:23 -0500 Subject: [PATCH 18/38] Add places365 dataset to linear eval. --- original_eval_scripts/places365/baseline.slrm | 47 +++++++++++++++++++ simsiam/linear_eval_original_code.py | 29 +++++++++++- 2 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 original_eval_scripts/places365/baseline.slrm diff --git a/original_eval_scripts/places365/baseline.slrm b/original_eval_scripts/places365/baseline.slrm new file mode 100644 index 0000000..5f9e4d4 --- /dev/null +++ b/original_eval_scripts/places365/baseline.slrm @@ -0,0 +1,47 @@ +#!/bin/bash + +#SBATCH --job-name="places365" +#SBATCH --partition=rtx6000 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=places365_baseline_160_%j.out +#SBATCH --error=places365_baseline_160_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. 
+export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." +nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/projects/imagenet_synthetic/fereshteh_datasets/places365" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="places365" \ No newline at end of file diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index 82db470..394626e 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -318,12 +318,14 @@ def print_pass(*args,flush=True): transforms.ToTensor(), normalize, ])) - val_dataset = datasets.ImageFolder(valdir, transforms.Compose([ + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, - ])), + ])) elif args.dataset_name == "food101": print("=> using food101 dataset.", flush=True) train_dataset=datasets.Food101( @@ -375,6 +377,29 @@ def print_pass(*args,flush=True): ], ), ) + elif args.dataset_name == "places365": + train_dataset=datasets.Places365( + root=args.data, + split="train-standard", + transform=transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ], + ),) + val_dataset=datasets.Places365( + root=args.data, + split="val", + 
transform=transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ], + ),) elif args.dataset_name == "INaturalist": train_dataset = datasets.INaturalist( root=args.data, From 5ca2ff4376817c16dda8bea82620480413fa0bd3 Mon Sep 17 00:00:00 2001 From: Arash Afkanpour Date: Wed, 6 Mar 2024 17:01:38 -0500 Subject: [PATCH 19/38] minor changes. --- simsiam/linear_eval_original_code.py | 375 ++++++++++++++++----------- 1 file changed, 229 insertions(+), 146 deletions(-) diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index 82db470..460cd68 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -90,35 +90,64 @@ help="weight decay (default: 0.)", dest="weight_decay", ) -parser.add_argument('-p', '--print-freq', default=10, type=int, - metavar='N', help='print frequency (default: 10)') -parser.add_argument('--resume', default='', type=str, metavar='PATH', - help='path to latest checkpoint (default: none)') -parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', - help='evaluate model on validation set') -parser.add_argument('--world-size', default=-1, type=int, - help='number of nodes for distributed training') -parser.add_argument('--rank', default=-1, type=int, - help='node rank for distributed training') -parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, - help='url used to set up distributed training') -parser.add_argument('--dist-backend', default='nccl', type=str, - help='distributed backend') -parser.add_argument('--seed', default=None, type=int, - help='seed for initializing training. ') -parser.add_argument('--gpu', default=None, type=int, - help='GPU id to use.') -parser.add_argument('--multiprocessing-distributed', action='store_true', - help='Use multi-processing distributed training to launch ' - 'N processes per node, which has N GPUs. 
This is the ' - 'fastest way to use PyTorch for either single node or ' - 'multi node data parallel training') +parser.add_argument( + "-p", + "--print-freq", + default=10, + type=int, + metavar="N", + help="print frequency (default: 10)", +) +parser.add_argument( + "--resume", + default="", + type=str, + metavar="PATH", + help="path to latest checkpoint (default: none)", +) +parser.add_argument( + "-e", + "--evaluate", + dest="evaluate", + action="store_true", + help="evaluate model on validation set", +) +parser.add_argument( + "--world-size", + default=-1, + type=int, + help="number of nodes for distributed training", +) +parser.add_argument( + "--rank", default=-1, type=int, help="node rank for distributed training" +) +parser.add_argument( + "--dist-url", + default="tcp://224.66.41.62:23456", + type=str, + help="url used to set up distributed training", +) +parser.add_argument( + "--dist-backend", default="nccl", type=str, help="distributed backend" +) +parser.add_argument( + "--seed", default=None, type=int, help="seed for initializing training. " +) +parser.add_argument("--gpu", default=None, type=int, help="GPU id to use.") +parser.add_argument( + "--multiprocessing-distributed", + action="store_true", + help="Use multi-processing distributed training to launch " + "N processes per node, which has N GPUs. 
This is the " + "fastest way to use PyTorch for either single node or " + "multi node data parallel training", +) # additional configs: -parser.add_argument('--pretrained', default='', type=str, - help='path to simsiam pretrained checkpoint') -parser.add_argument('--lars', action='store_true', - help='Use LARS') +parser.add_argument( + "--pretrained", default="", type=str, help="path to simsiam pretrained checkpoint" +) +parser.add_argument("--lars", action="store_true", help="Use LARS") parser.add_argument("--dataset_name", default="imagenet", help="Name of the dataset.") @@ -131,17 +160,21 @@ def main(): if args.seed is not None: random.seed(args.seed) torch.manual_seed(args.seed) - #NOTE: this line can reduce speed considerably + # NOTE: this line can reduce speed considerably # cudnn.deterministic = True - warnings.warn('You have chosen to seed training. ' - 'This will turn on the CUDNN deterministic setting, ' - 'which can slow down your training considerably! ' - 'You may see unexpected behavior when restarting ' - 'from checkpoints.') + warnings.warn( + "You have chosen to seed training. " + "This will turn on the CUDNN deterministic setting, " + "which can slow down your training considerably! " + "You may see unexpected behavior when restarting " + "from checkpoints." + ) if args.gpu is not None: - warnings.warn('You have chosen a specific GPU. This will completely ' - 'disable data parallelism.') + warnings.warn( + "You have chosen a specific GPU. This will completely " + "disable data parallelism." 
+ ) if args.dist_url == "env://" and args.world_size == -1: args.world_size = int(os.environ["WORLD_SIZE"]) @@ -157,7 +190,14 @@ def main(): print("second", args.world_size) # Use torch.multiprocessing.spawn to launch distributed processes: the # main_worker process function - mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args, )) + mp.spawn( + main_worker, + nprocs=ngpus_per_node, + args=( + ngpus_per_node, + args, + ), + ) else: # Simply call main_worker function main_worker(args.gpu, ngpus_per_node, args) @@ -170,8 +210,10 @@ def main_worker(gpu, ngpus_per_node, args): # suppress printing if not master if args.multiprocessing_distributed and args.gpu != 0: - def print_pass(*args,flush=True): + + def print_pass(*args, flush=True): pass + builtins.print = print_pass if args.gpu is not None: @@ -187,8 +229,12 @@ def print_pass(*args,flush=True): # global rank among all the processes args.rank = args.rank * ngpus_per_node + gpu print("second rank", args.rank, flush=True) - dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) + dist.init_process_group( + backend=args.dist_backend, + init_method=args.dist_url, + world_size=args.world_size, + rank=args.rank, + ) print("init_process_group", flush=True) torch.distributed.barrier() # create model @@ -199,7 +245,7 @@ def print_pass(*args,flush=True): # freeze all layers but the last fc for name, param in model.named_parameters(): - if name not in ['fc.weight', 'fc.bias']: + if name not in ["fc.weight", "fc.bias"]: param.requires_grad = False # init the fc layer model.fc.weight.data.normal_(mean=0.0, std=0.01) @@ -212,12 +258,14 @@ def print_pass(*args,flush=True): checkpoint = torch.load(args.pretrained, map_location="cpu") # rename moco pre-trained keys - state_dict = checkpoint['state_dict'] + state_dict = checkpoint["state_dict"] for k in list(state_dict.keys()): # retain only encoder up to before the embedding layer - if 
k.startswith('module.encoder') and not k.startswith('module.encoder.fc'): + if k.startswith("module.encoder") and not k.startswith( + "module.encoder.fc" + ): # remove prefix - state_dict[k[len("module.encoder."):]] = state_dict[k] + state_dict[k[len("module.encoder.") :]] = state_dict[k] # delete renamed or unused k del state_dict[k] @@ -243,11 +291,13 @@ def print_pass(*args,flush=True): # DistributedDataParallel, we need to divide the batch size # ourselves based on the total number of GPUs we have args.batch_size = int(args.batch_size / ngpus_per_node) - print("batchsize",args.batch_size, flush=True) + print("batchsize", args.batch_size, flush=True) args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) - print("workers",args.workers, flush=True) - print("gpu",args.gpu, flush=True) - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + print("workers", args.workers, flush=True) + print("gpu", args.gpu, flush=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.gpu] + ) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all @@ -258,7 +308,7 @@ def print_pass(*args,flush=True): model = model.cuda(args.gpu) else: # DataParallel will divide and allocate batch_size to all available GPUs - if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + if args.arch.startswith("alexnet") or args.arch.startswith("vgg"): model.features = torch.nn.DataParallel(model.features) model.cuda() else: @@ -271,13 +321,14 @@ def print_pass(*args,flush=True): parameters = list(filter(lambda p: p.requires_grad, model.parameters())) assert len(parameters) == 2 # fc.weight, fc.bias - optimizer = torch.optim.SGD(parameters, init_lr, - momentum=args.momentum, - weight_decay=args.weight_decay) + optimizer = torch.optim.SGD( + parameters, init_lr, momentum=args.momentum, weight_decay=args.weight_decay + ) if args.lars: print("=> use LARS optimizer.", flush=True) from LARC 
import LARC - optimizer = LARC(optimizer=optimizer, trust_coefficient=.001, clip=False) + + optimizer = LARC(optimizer=optimizer, trust_coefficient=0.001, clip=False) # optionally resume from a checkpoint if args.resume: @@ -287,48 +338,63 @@ def print_pass(*args,flush=True): checkpoint = torch.load(args.resume) else: # Map model to be loaded to specified single gpu. - loc = 'cuda:{}'.format(args.gpu) + loc = "cuda:{}".format(args.gpu) checkpoint = torch.load(args.resume, map_location=loc) - args.start_epoch = checkpoint['epoch'] - best_acc1 = checkpoint['best_acc1'] + args.start_epoch = checkpoint["epoch"] + best_acc1 = checkpoint["best_acc1"] if args.gpu is not None: # best_acc1 may be from a checkpoint from a different GPU best_acc1 = best_acc1.to(args.gpu) - model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - print("=> loaded checkpoint '{}' (epoch {})" - .format(args.resume, checkpoint['epoch']), flush=True) + model.load_state_dict(checkpoint["state_dict"]) + optimizer.load_state_dict(checkpoint["optimizer"]) + print( + "=> loaded checkpoint '{}' (epoch {})".format( + args.resume, checkpoint["epoch"] + ), + flush=True, + ) else: print("=> no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = True # Data loading code - traindir = os.path.join(args.data, 'train') - valdir = os.path.join(args.data, 'val') - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) + traindir = os.path.join(args.data, "train") + valdir = os.path.join(args.data, "val") + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) if args.dataset_name == "imagenet": train_dataset = datasets.ImageFolder( traindir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - val_dataset = datasets.ImageFolder(valdir, transforms.Compose([ - transforms.Resize(256), - 
transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])), + transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ] + ), + ) + val_dataset = ( + datasets.ImageFolder( + valdir, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ] + ), + ), + ) elif args.dataset_name == "food101": print("=> using food101 dataset.", flush=True) - train_dataset=datasets.Food101( - root=args.data, - split="train", + train_dataset = datasets.Food101( + root=args.data, + split="train", transform=transforms.Compose( [ transforms.RandomResizedCrop(224), @@ -336,10 +402,11 @@ def print_pass(*args,flush=True): transforms.ToTensor(), normalize, ], - ),) - val_dataset=datasets.Food101( - root=args.data, - split="test", + ), + ) + val_dataset = datasets.Food101( + root=args.data, + split="test", transform=transforms.Compose( [ transforms.Resize(256), @@ -347,12 +414,13 @@ def print_pass(*args,flush=True): transforms.ToTensor(), normalize, ], - ),) + ), + ) elif args.dataset_name == "cifar10": train_dataset = datasets.CIFAR10( - root=args.data, - train=True, - download=True, + root=args.data, + train=True, + download=True, transform=transforms.Compose( [ transforms.RandomResizedCrop(224), @@ -363,9 +431,9 @@ def print_pass(*args,flush=True): ), ) val_dataset = datasets.CIFAR10( - root=args.data, - train=False, - download=True, + root=args.data, + train=False, + download=True, transform=transforms.Compose( [ transforms.Resize(256), @@ -377,7 +445,7 @@ def print_pass(*args,flush=True): ) elif args.dataset_name == "INaturalist": train_dataset = datasets.INaturalist( - root=args.data, + root=args.data, version="2018", target_type="full", mode="train", @@ -391,8 +459,8 @@ def print_pass(*args,flush=True): ), ) val_dataset = datasets.INaturalist( - root=args.data_dir, - split="val", + root=args.data_dir, + split="val", 
transform=transforms.Compose( [ transforms.Resize(256), @@ -402,7 +470,6 @@ def print_pass(*args,flush=True): ], ), ) - if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) @@ -410,14 +477,22 @@ def print_pass(*args,flush=True): train_sampler = None train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), - num_workers=args.workers, pin_memory=True, sampler=train_sampler) - + train_dataset, + batch_size=args.batch_size, + shuffle=(train_sampler is None), + num_workers=args.workers, + pin_memory=True, + sampler=train_sampler, + ) + val_loader = torch.utils.data.DataLoader( - val_dataset, - batch_size=256, shuffle=False, - num_workers=args.workers, pin_memory=True) - + val_dataset, + batch_size=256, + shuffle=False, + num_workers=args.workers, + pin_memory=True, + ) + if args.evaluate: validate(val_loader, model, criterion, args) return @@ -439,29 +514,34 @@ def print_pass(*args,flush=True): is_best = acc1 > best_acc1 best_acc1 = max(acc1, best_acc1) - if not args.multiprocessing_distributed or (args.multiprocessing_distributed - and args.rank % ngpus_per_node == 0): - save_checkpoint({ - 'epoch': epoch + 1, - 'arch': args.arch, - 'state_dict': model.state_dict(), - 'best_acc1': best_acc1, - 'optimizer' : optimizer.state_dict(), - }, is_best) + if not args.multiprocessing_distributed or ( + args.multiprocessing_distributed and args.rank % ngpus_per_node == 0 + ): + save_checkpoint( + { + "epoch": epoch + 1, + "arch": args.arch, + "state_dict": model.state_dict(), + "best_acc1": best_acc1, + "optimizer": optimizer.state_dict(), + }, + is_best, + ) if epoch == args.start_epoch: sanity_check(model.state_dict(), args.pretrained) def train(train_loader, model, criterion, optimizer, epoch, args): - batch_time = AverageMeter('Time', ':6.3f') - data_time = AverageMeter('Data', ':6.3f') - losses = AverageMeter('Loss', ':.4e') - top1 = AverageMeter('Acc@1', ':6.2f') 
- top5 = AverageMeter('Acc@5', ':6.2f') + batch_time = AverageMeter("Time", ":6.3f") + data_time = AverageMeter("Data", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") progress = ProgressMeter( len(train_loader), [batch_time, data_time, losses, top1, top5], - prefix="Epoch: [{}]".format(epoch)) + prefix="Epoch: [{}]".format(epoch), + ) """ Switch to eval mode: @@ -506,19 +586,18 @@ def train(train_loader, model, criterion, optimizer, epoch, args): if i % args.print_freq == 0: progress.display(i) - + i += 1 def validate(val_loader, model, criterion, args): - batch_time = AverageMeter('Time', ':6.3f') - losses = AverageMeter('Loss', ':.4e') - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') + batch_time = AverageMeter("Time", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") progress = ProgressMeter( - len(val_loader), - [batch_time, losses, top1, top5], - prefix='Test: ') + len(val_loader), [batch_time, losses, top1, top5], prefix="Test: " + ) # switch to evaluate mode model.eval() @@ -551,16 +630,17 @@ def validate(val_loader, model, criterion, args): i += 1 # # TODO: this should also be done with the ProgressMeter - print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' - .format(top1=top1, top5=top5)) + print( + " * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}".format(top1=top1, top5=top5) + ) return top1.avg -def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): torch.save(state, filename) if is_best: - shutil.copyfile(filename, 'model_best.pth.tar') + shutil.copyfile(filename, "model_best.pth.tar") def sanity_check(state_dict, pretrained_weights): @@ -570,26 +650,31 @@ def sanity_check(state_dict, pretrained_weights): """ print("=> loading '{}' for sanity check".format(pretrained_weights)) 
checkpoint = torch.load(pretrained_weights, map_location="cpu") - state_dict_pre = checkpoint['state_dict'] + state_dict_pre = checkpoint["state_dict"] for k in list(state_dict.keys()): # only ignore fc layer - if 'fc.weight' in k or 'fc.bias' in k: + if "fc.weight" in k or "fc.bias" in k: continue # name in pretrained model - k_pre = 'module.encoder.' + k[len('module.'):] \ - if k.startswith('module.') else 'module.encoder.' + k + k_pre = ( + "module.encoder." + k[len("module.") :] + if k.startswith("module.") + else "module.encoder." + k + ) - assert ((state_dict[k].cpu() == state_dict_pre[k_pre]).all()), \ - '{} is changed in linear classifier training.'.format(k) + assert ( + state_dict[k].cpu() == state_dict_pre[k_pre] + ).all(), "{} is changed in linear classifier training.".format(k) print("=> sanity check passed.") class AverageMeter(object): """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): + + def __init__(self, name, fmt=":f"): self.name = name self.fmt = fmt self.reset() @@ -607,10 +692,10 @@ def update(self, val, n=1): self.avg = self.sum / self.count def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" return fmtstr.format(**self.__dict__) - - + + class ProgressMeter(object): def __init__(self, num_batches, meters, prefix=""): self.batch_fmtstr = self._get_batch_fmtstr(num_batches) @@ -620,21 +705,19 @@ def __init__(self, num_batches, meters, prefix=""): def display(self, batch): entries = [self.prefix + self.batch_fmtstr.format(batch)] entries += [str(meter) for meter in self.meters] - print('\t'.join(entries), flush=True) + print("\t".join(entries), flush=True) def _get_batch_fmtstr(self, num_batches): num_digits = len(str(num_batches // 1)) - fmt = '{:' + str(num_digits) + 'd}' - return '[' + fmt + '/' + fmt.format(num_batches) + ']' - - + fmt = "{:" + str(num_digits) + "d}" + return "[" + fmt + "/" + 
fmt.format(num_batches) + "]" def adjust_learning_rate(optimizer, init_lr, epoch, args): """Decay the learning rate based on schedule""" - cur_lr = init_lr * 0.5 * (1. + math.cos(math.pi * epoch / args.epochs)) + cur_lr = init_lr * 0.5 * (1.0 + math.cos(math.pi * epoch / args.epochs)) for param_group in optimizer.param_groups: - param_group['lr'] = cur_lr + param_group["lr"] = cur_lr def accuracy(output, target, topk=(1,)): @@ -654,5 +737,5 @@ def accuracy(output, target, topk=(1,)): return res -if __name__ == '__main__': - main() \ No newline at end of file +if __name__ == "__main__": + main() From 582bc4dfc66e71dfb883244d91ae782619fc013b Mon Sep 17 00:00:00 2001 From: Arash Afkanpour Date: Wed, 6 Mar 2024 17:14:47 -0500 Subject: [PATCH 20/38] Fix error. --- simsiam/linear_eval_original_code.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index 351544f..4128179 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -164,10 +164,7 @@ def main(): args = parser.parse_args() current_time = datetime.now().strftime("%Y-%m-%d-%H-%M") - checkpoint_subdir = ( - f"{args.experiment}_{current_time}" if args.experiment else f"{current_time}" - ) - args.checkpoint_dir = os.path.join(args.checkpoint_dir, checkpoint_subdir) + args.checkpoint_dir = os.path.join(args.checkpoint_dir, f"eval_{current_time}") os.makedirs(args.checkpoint_dir, exist_ok=True) print(args) From 2fccb3a2393e592d5dd2acd3c85f44c72c138b00 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 6 Mar 2024 14:39:45 -0800 Subject: [PATCH 21/38] add inaturalist --- .../INaturalist/baseline.slrm | 2 +- simsiam/inatural_dataset.py | 77 +++++++++++++++++++ simsiam/linear_eval_original_code.py | 12 +-- 3 files changed, 84 insertions(+), 7 deletions(-) create mode 100644 simsiam/inatural_dataset.py diff --git a/original_eval_scripts/INaturalist/baseline.slrm 
b/original_eval_scripts/INaturalist/baseline.slrm index e5673f4..8254c7a 100644 --- a/original_eval_scripts/INaturalist/baseline.slrm +++ b/original_eval_scripts/INaturalist/baseline.slrm @@ -33,7 +33,7 @@ export PYTHONPATH="." nvidia-smi python simsiam/linear_eval_original_code.py \ - --data="/projects/imagenet_synthetic/fereshteh_datasets" \ + --data="/datasets/inat_comp/2018/" \ --arch="resnet50" \ --multiprocessing-distributed \ --lars \ diff --git a/simsiam/inatural_dataset.py b/simsiam/inatural_dataset.py new file mode 100644 index 0000000..42a46f1 --- /dev/null +++ b/simsiam/inatural_dataset.py @@ -0,0 +1,77 @@ +import torch.utils.data as data +from PIL import Image +import os +import json +from torchvision import transforms +import random +import numpy as np + + +def default_loader(path): + return Image.open(path).convert('RGB') + +def load_taxonomy(ann_data, tax_levels, classes): + # loads the taxonomy data and converts to ints + taxonomy = {} + + if 'categories' in ann_data.keys(): + num_classes = len(ann_data['categories']) + for tt in tax_levels: + tax_data = [aa[tt] for aa in ann_data['categories']] + _, tax_id = np.unique(tax_data, return_inverse=True) + taxonomy[tt] = dict(zip(range(num_classes), list(tax_id))) + else: + # set up dummy data + for tt in tax_levels: + taxonomy[tt] = dict(zip([0], [0])) + + # create a dictionary of lists containing taxonomic labels + classes_taxonomic = {} + for cc in np.unique(classes): + tax_ids = [0]*len(tax_levels) + for ii, tt in enumerate(tax_levels): + tax_ids[ii] = taxonomy[tt][cc] + classes_taxonomic[cc] = tax_ids + + return taxonomy, classes_taxonomic + + +class INAT(data.Dataset): + def __init__(self, root, ann_file, transform): + + # load annotations + print('Loading annotations from: ' + os.path.basename(ann_file)) + with open(ann_file) as data_file: + ann_data = json.load(data_file) + + # set up the filenames and annotations + self.imgs = [aa['file_name'] for aa in ann_data['images']] + self.ids = 
[aa['id'] for aa in ann_data['images']] + + # if we dont have class labels set them to '0' + if 'annotations' in ann_data.keys(): + self.classes = [aa['category_id'] for aa in ann_data['annotations']] + else: + self.classes = [0]*len(self.imgs) + + # print out some stats + print('\t' + str(len(self.imgs)) + ' images') + print('\t' + str(len(set(self.classes))) + ' classes') + + self.root = root + self.loader = default_loader + + # augmentation params + self.transform = transform + + def __getitem__(self, index): + path = self.root + self.imgs[index] + img = self.loader(path) + species_id = self.classes[index] + + img = self.transform(img) + + return img, species_id + + def __len__(self): + return len(self.imgs) \ No newline at end of file diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index 4128179..77144d7 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -29,6 +29,8 @@ import torchvision.transforms as transforms from tqdm import tqdm +from inatural_dataset import INAT + model_names = sorted( name @@ -479,11 +481,9 @@ def print_pass(*args, flush=True): ), ) elif args.dataset_name == "INaturalist": - train_dataset = datasets.INaturalist( + train_dataset = INAT( root=args.data, - version="2018", - target_type="full", - mode="train", + ann_file=os.path.join(args.data, "train2018.json"), transform=transforms.Compose( [ transforms.RandomResizedCrop(224), @@ -494,8 +494,8 @@ def print_pass(*args, flush=True): ), ) val_dataset = datasets.INaturalist( - root=args.data_dir, - split="val", + root=args.data, + ann_file=os.path.join(args.data, "val2018.json"), transform=transforms.Compose( [ transforms.Resize(256), From 0bc3f3f7db9b9de05c31197cba7770d8ad74e4e1 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 6 Mar 2024 14:46:57 -0800 Subject: [PATCH 22/38] fix inat --- simsiam/linear_eval_original_code.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index 77144d7..32a6ad7 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -493,7 +493,7 @@ def print_pass(*args, flush=True): ], ), ) - val_dataset = datasets.INaturalist( + val_dataset = INAT( root=args.data, ann_file=os.path.join(args.data, "val2018.json"), transform=transforms.Compose( From a4a8ef97aec9c2cf2b08e4d9d46af59ac668f146 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 6 Mar 2024 14:58:25 -0800 Subject: [PATCH 23/38] fix inaturalist script --- original_eval_scripts/INaturalist/baseline.slrm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/original_eval_scripts/INaturalist/baseline.slrm b/original_eval_scripts/INaturalist/baseline.slrm index 8254c7a..dfc32fc 100644 --- a/original_eval_scripts/INaturalist/baseline.slrm +++ b/original_eval_scripts/INaturalist/baseline.slrm @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --job-name="inaturalist" -#SBATCH --partition=t4v2 +#SBATCH --partition=a40 #SBATCH --qos=deadline #SBATCH --account=deadline #SBATCH --nodes=1 @@ -44,4 +44,4 @@ python simsiam/linear_eval_original_code.py \ --rank 0 \ --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="INaturalis" \ No newline at end of file + --dataset_name="INaturalist" \ No newline at end of file From 06d77dd7f655c9e23dc3c282f3a2d89aedcbd57a Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 6 Mar 2024 15:13:54 -0800 Subject: [PATCH 24/38] update class head --- original_eval_scripts/CIFAR10/baseline.slrm | 3 ++- original_eval_scripts/INaturalist/baseline.slrm | 3 ++- original_eval_scripts/food101/baseline.slrm | 3 ++- original_eval_scripts/places365/baseline.slrm | 3 ++- simsiam/linear_eval_original_code.py | 9 +++++++++ 5 files changed, 17 insertions(+), 4 deletions(-) diff --git 
a/original_eval_scripts/CIFAR10/baseline.slrm b/original_eval_scripts/CIFAR10/baseline.slrm index b655f3b..5a1ae90 100644 --- a/original_eval_scripts/CIFAR10/baseline.slrm +++ b/original_eval_scripts/CIFAR10/baseline.slrm @@ -44,4 +44,5 @@ python simsiam/linear_eval_original_code.py \ --rank 0 \ --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="cifar10" \ No newline at end of file + --dataset_name="cifar10" \ + --num_classes=10 \ No newline at end of file diff --git a/original_eval_scripts/INaturalist/baseline.slrm b/original_eval_scripts/INaturalist/baseline.slrm index dfc32fc..d7c0acb 100644 --- a/original_eval_scripts/INaturalist/baseline.slrm +++ b/original_eval_scripts/INaturalist/baseline.slrm @@ -44,4 +44,5 @@ python simsiam/linear_eval_original_code.py \ --rank 0 \ --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="INaturalist" \ No newline at end of file + --dataset_name="INaturalist" \ + --num_classes=8142 \ No newline at end of file diff --git a/original_eval_scripts/food101/baseline.slrm b/original_eval_scripts/food101/baseline.slrm index 83316cf..c4b9604 100644 --- a/original_eval_scripts/food101/baseline.slrm +++ b/original_eval_scripts/food101/baseline.slrm @@ -44,4 +44,5 @@ python simsiam/linear_eval_original_code.py \ --rank 0 \ --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="food101" \ No newline at end of file + --dataset_name="food101" \ + --num_classes=101 \ No newline at end of file diff --git a/original_eval_scripts/places365/baseline.slrm 
b/original_eval_scripts/places365/baseline.slrm index 5f9e4d4..b5854a7 100644 --- a/original_eval_scripts/places365/baseline.slrm +++ b/original_eval_scripts/places365/baseline.slrm @@ -44,4 +44,5 @@ python simsiam/linear_eval_original_code.py \ --rank 0 \ --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="places365" \ No newline at end of file + --dataset_name="places365" \ + --num_classes=434 \ No newline at end of file diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index 32a6ad7..31a7e6e 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -160,6 +160,13 @@ help="Checkpoint root directory.", ) +parser.add_argument( + "--num_classes", + default=1000, + type=int, + help="Number of classes in the dataset.", +) + best_acc1 = 0 @@ -255,6 +262,8 @@ def print_pass(*args, flush=True): print("=> creating model '{}'".format(args.arch), flush=True) model = models.__dict__[args.arch]() + model.fc = nn.Linear(2048, args.num_classes) + print("model", model.state_dict().keys(), flush=True) # freeze all layers but the last fc From 6d3584b8407d747ea5e25f779faa47300c51d6ed Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Wed, 6 Mar 2024 16:15:27 -0800 Subject: [PATCH 25/38] add cifar100 --- original_eval_scripts/CIFAR100/baseline.slrm | 48 ++++++++++++++++++++ simsiam/linear_eval_original_code.py | 25 ++++++++++ 2 files changed, 73 insertions(+) create mode 100644 original_eval_scripts/CIFAR100/baseline.slrm diff --git a/original_eval_scripts/CIFAR100/baseline.slrm b/original_eval_scripts/CIFAR100/baseline.slrm new file mode 100644 index 0000000..50f3791 --- /dev/null +++ b/original_eval_scripts/CIFAR100/baseline.slrm @@ -0,0 +1,48 @@ +#!/bin/bash + +#SBATCH --job-name="cifar" +#SBATCH --partition=a40 +#SBATCH --qos=deadline +#SBATCH 
--account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=cifar10_baseline_160_%j.out +#SBATCH --error=cifar10_baseline_160_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." +nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/projects/imagenet_synthetic/fereshteh_datasets" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="cifar100" \ + --num_classes=100 \ No newline at end of file diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index 31a7e6e..93f0317 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -464,6 +464,31 @@ def print_pass(*args, flush=True): ], ), ) + elif args.dataset_name == "cifar100": + train_dataset = datasets.CIFAR100( + root=args.data, + train=True, + transform=transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 
0.5, 0.5)), + ], + ), + ) + val_dataset = datasets.CIFAR100( + root=args.data, + train=False, + transform=transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ], + ), + ) elif args.dataset_name == "places365": train_dataset = datasets.Places365( root=args.data, From c98b3e6436cda4b3422942a7198e9555ee83e128 Mon Sep 17 00:00:00 2001 From: Arash Afkanpour Date: Thu, 7 Mar 2024 04:25:44 -0500 Subject: [PATCH 26/38] minor changes --- original_eval_scripts/CIFAR10/baseline.slrm | 7 ++++--- original_eval_scripts/food101/baseline.slrm | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/original_eval_scripts/CIFAR10/baseline.slrm b/original_eval_scripts/CIFAR10/baseline.slrm index b655f3b..7d547b4 100644 --- a/original_eval_scripts/CIFAR10/baseline.slrm +++ b/original_eval_scripts/CIFAR10/baseline.slrm @@ -9,8 +9,7 @@ #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=32 #SBATCH --mem=0 -#SBATCH --output=cifar10_baseline_160_%j.out -#SBATCH --error=cifar10_baseline_160_%j.err +#SBATCH --output=slurm-cifar10_baseline_160_%j.out #SBATCH --open-mode=append #SBATCH --wait-all-nodes=1 #SBATCH --time=72:00:00 @@ -44,4 +43,6 @@ python simsiam/linear_eval_original_code.py \ --rank 0 \ --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_baseline_seed43_bs128_rforig_2024-03-05-12-27/checkpoint_0160.pth.tar"\ --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ - --dataset_name="cifar10" \ No newline at end of file + --dataset_name="cifar10" + + diff --git a/original_eval_scripts/food101/baseline.slrm b/original_eval_scripts/food101/baseline.slrm index 83316cf..270e248 100644 --- a/original_eval_scripts/food101/baseline.slrm +++ b/original_eval_scripts/food101/baseline.slrm @@ -9,8 +9,7 @@ #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=32 #SBATCH --mem=0 -#SBATCH --output=food101_baseline_160_%j.out -#SBATCH 
--error=food101_baseline_160_%j.err +#SBATCH --output=slurm-food101_baseline_160_%j.out #SBATCH --open-mode=append #SBATCH --wait-all-nodes=1 #SBATCH --time=72:00:00 From 8c34ca121a809d7325a77cd8c7f8dd07764a2a4f Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 11 Mar 2024 12:01:50 -0700 Subject: [PATCH 27/38] add icgan ablation --- original_eval_scripts/imagenet/baseline.slrm | 46 ++++++++ simsiam/linear_eval_original_code.py | 105 +++++++++++-------- 2 files changed, 110 insertions(+), 41 deletions(-) create mode 100644 original_eval_scripts/imagenet/baseline.slrm diff --git a/original_eval_scripts/imagenet/baseline.slrm b/original_eval_scripts/imagenet/baseline.slrm new file mode 100644 index 0000000..a9b6e1e --- /dev/null +++ b/original_eval_scripts/imagenet/baseline.slrm @@ -0,0 +1,46 @@ +#!/bin/bash + +#SBATCH --job-name="sana_eval" +#SBATCH --partition=t4v2 + +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=singlenode_stablediff_160_%j.out +#SBATCH --error=singlenode_stablediff_160_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." 
+nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/scratch/ssd004/datasets/imagenet256" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars --batch-size=2048 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_p0p5_seed43_2024-03-05-13-39/checkpoint_0160.pth.tar" \ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index 93f0317..0007e08 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -28,6 +28,7 @@ import torchvision.models as models import torchvision.transforms as transforms from tqdm import tqdm +from icgan.data_utils import utils as data_utils from inatural_dataset import INAT @@ -167,6 +168,14 @@ help="Number of classes in the dataset.", ) +parser.add_argument( + "--ablation_mode", + default="ICGAN", + type=str, + help="Using ICGAN or stable diffusion feature extractor for ablation study.", + hint="ICGAN or stable_diffusion", +) + best_acc1 = 0 @@ -258,47 +267,61 @@ def print_pass(*args, flush=True): ) print("init_process_group", flush=True) torch.distributed.barrier() - # create model - print("=> creating model '{}'".format(args.arch), flush=True) - model = models.__dict__[args.arch]() - - model.fc = nn.Linear(2048, args.num_classes) - - print("model", model.state_dict().keys(), flush=True) - - # freeze all layers but the last fc - for name, param in model.named_parameters(): - if name not in ["fc.weight", "fc.bias"]: - param.requires_grad = False - # init the fc layer - model.fc.weight.data.normal_(mean=0.0, std=0.01) - model.fc.bias.data.zero_() - - # load from pre-trained, before DistributedDataParallel constructor - if args.pretrained: - if os.path.isfile(args.pretrained): - print("=> loading checkpoint '{}'".format(args.pretrained), flush=True) - checkpoint = torch.load(args.pretrained, 
map_location="cpu") - - # rename moco pre-trained keys - state_dict = checkpoint["state_dict"] - for k in list(state_dict.keys()): - # retain only encoder up to before the embedding layer - if k.startswith("module.encoder") and not k.startswith( - "module.encoder.fc" - ): - # remove prefix - state_dict[k[len("module.encoder.") :]] = state_dict[k] - # delete renamed or unused k - del state_dict[k] - - args.start_epoch = 0 - msg = model.load_state_dict(state_dict, strict=False) - assert set(msg.missing_keys) == {"fc.weight", "fc.bias"} - - print("=> loaded pre-trained model '{}'".format(args.pretrained)) - else: - print("=> no checkpoint found at '{}'".format(args.pretrained)) + + if args.ablation_mode == "ICGAN": + model = data_utils.load_pretrained_feature_extractor( + args.pretrained, feature_extractor="selfsupervised" + ) + + # freeze all layers but the last fc + for name, param in model.named_parameters(): + if name not in ["fc.weight", "fc.bias"]: + param.requires_grad = False + # init the fc layer + model.fc.weight.data.normal_(mean=0.0, std=0.01) + model.fc.bias.data.zero_() + else: + # create model + print("=> creating model '{}'".format(args.arch), flush=True) + model = models.__dict__[args.arch]() + + model.fc = nn.Linear(2048, args.num_classes) + + print("model", model.state_dict().keys(), flush=True) + + # freeze all layers but the last fc + for name, param in model.named_parameters(): + if name not in ["fc.weight", "fc.bias"]: + param.requires_grad = False + # init the fc layer + model.fc.weight.data.normal_(mean=0.0, std=0.01) + model.fc.bias.data.zero_() + + # load from pre-trained, before DistributedDataParallel constructor + if args.pretrained: + if os.path.isfile(args.pretrained): + print("=> loading checkpoint '{}'".format(args.pretrained), flush=True) + checkpoint = torch.load(args.pretrained, map_location="cpu") + + # rename moco pre-trained keys + state_dict = checkpoint["state_dict"] + for k in list(state_dict.keys()): + # retain only encoder 
up to before the embedding layer + if k.startswith("module.encoder") and not k.startswith( + "module.encoder.fc" + ): + # remove prefix + state_dict[k[len("module.encoder.") :]] = state_dict[k] + # delete renamed or unused k + del state_dict[k] + + args.start_epoch = 0 + msg = model.load_state_dict(state_dict, strict=False) + assert set(msg.missing_keys) == {"fc.weight", "fc.bias"} + + print("=> loaded pre-trained model '{}'".format(args.pretrained)) + else: + print("=> no checkpoint found at '{}'".format(args.pretrained)) # infer learning rate before changing batch size init_lr = args.lr * args.batch_size / 256 From 27311ae574aecba5d1d198c7908539e3e5c6da6b Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 11 Mar 2024 12:26:44 -0700 Subject: [PATCH 28/38] add icgan ablation --- original_eval_scripts/CIFAR10/icgan_ab.slrm | 49 +++++++++++++++++++ original_eval_scripts/CIFAR100/baseline.slrm | 4 +- original_eval_scripts/CIFAR100/icgan_ab.slrm | 49 +++++++++++++++++++ .../INaturalist/icgan_ab.slrm | 49 +++++++++++++++++++ original_eval_scripts/food101/icgan_ab.slrm | 49 +++++++++++++++++++ original_eval_scripts/imagenet/icgan_ab.slrm | 46 +++++++++++++++++ original_eval_scripts/places365/icgan_ab.slrm | 49 +++++++++++++++++++ simsiam/linear_eval_original_code.py | 9 ++-- 8 files changed, 298 insertions(+), 6 deletions(-) create mode 100644 original_eval_scripts/CIFAR10/icgan_ab.slrm create mode 100644 original_eval_scripts/CIFAR100/icgan_ab.slrm create mode 100644 original_eval_scripts/INaturalist/icgan_ab.slrm create mode 100644 original_eval_scripts/food101/icgan_ab.slrm create mode 100644 original_eval_scripts/imagenet/icgan_ab.slrm create mode 100644 original_eval_scripts/places365/icgan_ab.slrm diff --git a/original_eval_scripts/CIFAR10/icgan_ab.slrm b/original_eval_scripts/CIFAR10/icgan_ab.slrm new file mode 100644 index 0000000..09558fa --- /dev/null +++ b/original_eval_scripts/CIFAR10/icgan_ab.slrm @@ -0,0 +1,49 @@ +#!/bin/bash + +#SBATCH --job-name="cifar" 
+#SBATCH --partition=a40 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=icgan_ab_cifar10_%j.out +#SBATCH --error=icgan_ab_cifar10_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." +nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/projects/imagenet_synthetic/fereshteh_datasets" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/ssd003/projects/aieng/genssl/swav_pretrained.pth.tar" \ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="cifar10" \ + --num_classes=10 \ + --ablation_mode="icgan" diff --git a/original_eval_scripts/CIFAR100/baseline.slrm b/original_eval_scripts/CIFAR100/baseline.slrm index 50f3791..1c9ed80 100644 --- a/original_eval_scripts/CIFAR100/baseline.slrm +++ b/original_eval_scripts/CIFAR100/baseline.slrm @@ -9,8 +9,8 @@ #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=32 #SBATCH --mem=0 -#SBATCH --output=cifar10_baseline_160_%j.out -#SBATCH --error=cifar10_baseline_160_%j.err +#SBATCH --output=cifar100_baseline_160_%j.out +#SBATCH --error=cifar100_baseline_160_%j.err #SBATCH --open-mode=append #SBATCH --wait-all-nodes=1 #SBATCH 
--time=72:00:00 diff --git a/original_eval_scripts/CIFAR100/icgan_ab.slrm b/original_eval_scripts/CIFAR100/icgan_ab.slrm new file mode 100644 index 0000000..97b6373 --- /dev/null +++ b/original_eval_scripts/CIFAR100/icgan_ab.slrm @@ -0,0 +1,49 @@ +#!/bin/bash + +#SBATCH --job-name="cifar" +#SBATCH --partition=a40 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=icgan_ab_cifar100_%j.out +#SBATCH --error=icgan_ab_cifar100_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." 
+nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/projects/imagenet_synthetic/fereshteh_datasets" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/ssd003/projects/aieng/genssl/swav_pretrained.pth.tar" \ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="cifar100" \ + --num_classes=100 \ + --ablation_mode="icgan" \ No newline at end of file diff --git a/original_eval_scripts/INaturalist/icgan_ab.slrm b/original_eval_scripts/INaturalist/icgan_ab.slrm new file mode 100644 index 0000000..d579a3f --- /dev/null +++ b/original_eval_scripts/INaturalist/icgan_ab.slrm @@ -0,0 +1,49 @@ +#!/bin/bash + +#SBATCH --job-name="inaturalist" +#SBATCH --partition=a40 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=icgan_ab_inaturalist_%j.out +#SBATCH --error=icgan_ab_inaturalist_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." 
+nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/datasets/inat_comp/2018/" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/ssd003/projects/aieng/genssl/swav_pretrained.pth.tar" \ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="INaturalist" \ + --num_classes=8142 \ + --ablation_mode="icgan" \ No newline at end of file diff --git a/original_eval_scripts/food101/icgan_ab.slrm b/original_eval_scripts/food101/icgan_ab.slrm new file mode 100644 index 0000000..30ce13e --- /dev/null +++ b/original_eval_scripts/food101/icgan_ab.slrm @@ -0,0 +1,49 @@ +#!/bin/bash + +#SBATCH --job-name="food101" +#SBATCH --partition=a40 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=icgan_ab_food101_%j.out +#SBATCH --error=icgan_ab_food101_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." 
+nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/projects/imagenet_synthetic/fereshteh_datasets" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/ssd003/projects/aieng/genssl/swav_pretrained.pth.tar" \ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="food101" \ + --num_classes=101 \ + --ablation_mode="icgan" \ No newline at end of file diff --git a/original_eval_scripts/imagenet/icgan_ab.slrm b/original_eval_scripts/imagenet/icgan_ab.slrm new file mode 100644 index 0000000..0020867 --- /dev/null +++ b/original_eval_scripts/imagenet/icgan_ab.slrm @@ -0,0 +1,46 @@ +#!/bin/bash + +#SBATCH --job-name="sana_eval" +#SBATCH --partition=a40 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=icgan_ab_imagenet_%j.out +#SBATCH --error=icgan_ab_imagenet_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." 
+nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/scratch/ssd004/datasets/imagenet256" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars --batch-size=2048 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/ssd003/projects/aieng/genssl/swav_pretrained.pth.tar" \ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --ablation_mode="icgan" diff --git a/original_eval_scripts/places365/icgan_ab.slrm b/original_eval_scripts/places365/icgan_ab.slrm new file mode 100644 index 0000000..486933d --- /dev/null +++ b/original_eval_scripts/places365/icgan_ab.slrm @@ -0,0 +1,49 @@ +#!/bin/bash + +#SBATCH --job-name="places365" +#SBATCH --partition=rtx6000 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=icgan_ab_places365_%j.out +#SBATCH --error=icgan_ab_places365_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." 
+nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/projects/imagenet_synthetic/fereshteh_datasets/places365" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/ssd003/projects/aieng/genssl/swav_pretrained.pth.tar" \ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="places365" \ + --num_classes=434 \ + --ablation_mode="icgan" \ No newline at end of file diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index 0007e08..d9b4af8 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -170,10 +170,10 @@ parser.add_argument( "--ablation_mode", - default="ICGAN", + default="icgan", type=str, - help="Using ICGAN or stable diffusion feature extractor for ablation study.", - hint="ICGAN or stable_diffusion", + help="Using icgan or stable diffusion feature extractor for ablation study.", + hint="icgan or stable_diffusion", ) best_acc1 = 0 @@ -268,7 +268,8 @@ def print_pass(*args, flush=True): print("init_process_group", flush=True) torch.distributed.barrier() - if args.ablation_mode == "ICGAN": + if args.ablation_mode == "icgan": + print("=> using icgan feature extractor.", flush=True) model = data_utils.load_pretrained_feature_extractor( args.pretrained, feature_extractor="selfsupervised" ) From 35a875bd08a069d299e0e615b1dbf108920d8111 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 11 Mar 2024 13:09:39 -0700 Subject: [PATCH 29/38] debug --- simsiam/linear_eval_original_code.py | 1 - 1 file changed, 1 deletion(-) diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index d9b4af8..b890f39 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -173,7 +173,6 @@ default="icgan", type=str, help="Using icgan or stable diffusion feature extractor for ablation study.", - 
hint="icgan or stable_diffusion", ) best_acc1 = 0 From e2830b9ad7bfb9d43d6cb90455b8d9034d9e0063 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 11 Mar 2024 13:11:48 -0700 Subject: [PATCH 30/38] update num classes --- simsiam/linear_eval_original_code.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index b890f39..f792c43 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -273,6 +273,8 @@ def print_pass(*args, flush=True): args.pretrained, feature_extractor="selfsupervised" ) + model.fc = nn.Linear(2048, args.num_classes) + # freeze all layers but the last fc for name, param in model.named_parameters(): if name not in ["fc.weight", "fc.bias"]: From 95addac955152d7f03ad8271de2bf3a9fd1b9c7c Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 11 Mar 2024 13:13:47 -0700 Subject: [PATCH 31/38] update code --- simsiam/linear_eval_original_code.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index f792c43..3ab6fd1 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -282,6 +282,8 @@ def print_pass(*args, flush=True): # init the fc layer model.fc.weight.data.normal_(mean=0.0, std=0.01) model.fc.bias.data.zero_() + + args.start_epoch = 0 else: # create model print("=> creating model '{}'".format(args.arch), flush=True) From f044643e4bc46c6ea831c11a29b849b58c39ab18 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 11 Mar 2024 13:23:29 -0700 Subject: [PATCH 32/38] update linear eval code --- simsiam/linear_eval_original_code.py | 90 +++++++++++++--------------- 1 file changed, 41 insertions(+), 49 deletions(-) diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index 3ab6fd1..15e8b33 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -267,50 
+267,42 @@ def print_pass(*args, flush=True): print("init_process_group", flush=True) torch.distributed.barrier() - if args.ablation_mode == "icgan": - print("=> using icgan feature extractor.", flush=True) - model = data_utils.load_pretrained_feature_extractor( - args.pretrained, feature_extractor="selfsupervised" - ) - - model.fc = nn.Linear(2048, args.num_classes) - - # freeze all layers but the last fc - for name, param in model.named_parameters(): - if name not in ["fc.weight", "fc.bias"]: - param.requires_grad = False - # init the fc layer - model.fc.weight.data.normal_(mean=0.0, std=0.01) - model.fc.bias.data.zero_() - args.start_epoch = 0 - else: - # create model - print("=> creating model '{}'".format(args.arch), flush=True) - model = models.__dict__[args.arch]() - - model.fc = nn.Linear(2048, args.num_classes) - - print("model", model.state_dict().keys(), flush=True) - - # freeze all layers but the last fc - for name, param in model.named_parameters(): - if name not in ["fc.weight", "fc.bias"]: - param.requires_grad = False - # init the fc layer - model.fc.weight.data.normal_(mean=0.0, std=0.01) - model.fc.bias.data.zero_() - - # load from pre-trained, before DistributedDataParallel constructor - if args.pretrained: - if os.path.isfile(args.pretrained): - print("=> loading checkpoint '{}'".format(args.pretrained), flush=True) - checkpoint = torch.load(args.pretrained, map_location="cpu") - - # rename moco pre-trained keys - state_dict = checkpoint["state_dict"] - for k in list(state_dict.keys()): - # retain only encoder up to before the embedding layer + # create model + print("=> creating model '{}'".format(args.arch), flush=True) + model = models.__dict__[args.arch]() + + model.fc = nn.Linear(2048, args.num_classes) + + print("model", model.state_dict().keys(), flush=True) + + # freeze all layers but the last fc + for name, param in model.named_parameters(): + if name not in ["fc.weight", "fc.bias"]: + param.requires_grad = False + # init the fc layer + 
model.fc.weight.data.normal_(mean=0.0, std=0.01) + model.fc.bias.data.zero_() + + # load from pre-trained, before DistributedDataParallel constructor + if args.pretrained: + if os.path.isfile(args.pretrained): + print("=> loading checkpoint '{}'".format(args.pretrained), flush=True) + checkpoint = torch.load(args.pretrained, map_location="cpu") + + # rename moco pre-trained keys + state_dict = checkpoint["state_dict"] + for k in list(state_dict.keys()): + # retain only encoder up to before the embedding layer + if args.ablation_mode == "icgan": + if k.startswith("module") and not k.startswith( + "module.fc" + ): + # remove prefix + state_dict[k[len("module.") :]] = state_dict[k] + # delete renamed or unused k + del state_dict[k] + else: if k.startswith("module.encoder") and not k.startswith( "module.encoder.fc" ): @@ -319,13 +311,13 @@ def print_pass(*args, flush=True): # delete renamed or unused k del state_dict[k] - args.start_epoch = 0 - msg = model.load_state_dict(state_dict, strict=False) - assert set(msg.missing_keys) == {"fc.weight", "fc.bias"} + args.start_epoch = 0 + msg = model.load_state_dict(state_dict, strict=False) + assert set(msg.missing_keys) == {"fc.weight", "fc.bias"} - print("=> loaded pre-trained model '{}'".format(args.pretrained)) - else: - print("=> no checkpoint found at '{}'".format(args.pretrained)) + print("=> loaded pre-trained model '{}'".format(args.pretrained)) + else: + print("=> no checkpoint found at '{}'".format(args.pretrained)) # infer learning rate before changing batch size init_lr = args.lr * args.batch_size / 256 From ee0a6749bc9cd8ac94bcb000e1bcbdcc840a5512 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 11 Mar 2024 13:25:58 -0700 Subject: [PATCH 33/38] update icgan code --- simsiam/linear_eval_original_code.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index 15e8b33..2278ead 100644 --- 
a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -291,7 +291,10 @@ def print_pass(*args, flush=True): checkpoint = torch.load(args.pretrained, map_location="cpu") # rename moco pre-trained keys - state_dict = checkpoint["state_dict"] + if args.ablation_mode == "icgan": + state_dict = checkpoint + else: + state_dict = checkpoint["state_dict"] for k in list(state_dict.keys()): # retain only encoder up to before the embedding layer if args.ablation_mode == "icgan": From 6efa7ef179a515c18b778bb408e5e3eed6b65d4d Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 11 Mar 2024 13:46:05 -0700 Subject: [PATCH 34/38] correct code --- simsiam/linear_eval_original_code.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index 2278ead..9731f86 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -622,7 +622,8 @@ def print_pass(*args, flush=True): filename=checkpoint_file, ) if epoch == args.start_epoch: - sanity_check(model.state_dict(), args.pretrained) + if args.ablation_mode != "icgan": + sanity_check(model.state_dict(), args.pretrained) def train(train_loader, model, criterion, optimizer, epoch, args): From dbf6c4f79f5d97cd19793afc2ccb5686e1503d38 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Mon, 11 Mar 2024 15:04:41 -0700 Subject: [PATCH 35/38] add sanity check --- simsiam/linear_eval_original_code.py | 30 ++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/simsiam/linear_eval_original_code.py b/simsiam/linear_eval_original_code.py index 9731f86..5d85f85 100644 --- a/simsiam/linear_eval_original_code.py +++ b/simsiam/linear_eval_original_code.py @@ -622,8 +622,7 @@ def print_pass(*args, flush=True): filename=checkpoint_file, ) if epoch == args.start_epoch: - if args.ablation_mode != "icgan": - sanity_check(model.state_dict(), args.pretrained) + 
sanity_check(model.state_dict(), args.pretrained, args.ablation_mode) def train(train_loader, model, criterion, optimizer, epoch, args): @@ -740,26 +739,37 @@ def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): shutil.copyfile(filename, "model_best.pth.tar") -def sanity_check(state_dict, pretrained_weights): +def sanity_check(state_dict, pretrained_weights, ablation_mode): """ Linear classifier should not change any weights other than the linear layer. This sanity check asserts nothing wrong happens (e.g., BN stats updated). """ print("=> loading '{}' for sanity check".format(pretrained_weights)) checkpoint = torch.load(pretrained_weights, map_location="cpu") - state_dict_pre = checkpoint["state_dict"] + if ablation_mode == "icgan": + state_dict_pre = checkpoint + else: + state_dict_pre = checkpoint["state_dict"] for k in list(state_dict.keys()): # only ignore fc layer if "fc.weight" in k or "fc.bias" in k: continue + if ablation_mode == "icgan": + # name in pretrained model + k_pre = ( + "module." + k[len("module.") :] + if k.startswith("module.") + else "module." + k + ) - # name in pretrained model - k_pre = ( - "module.encoder." + k[len("module.") :] - if k.startswith("module.") - else "module.encoder." + k - ) + else: + # name in pretrained model + k_pre = ( + "module.encoder." + k[len("module.") :] + if k.startswith("module.") + else "module.encoder." 
+ k + ) assert ( state_dict[k].cpu() == state_dict_pre[k_pre] From 65b44c7d913eae18499511e25dab524722f27ed5 Mon Sep 17 00:00:00 2001 From: Vahid Reza Khazaie Date: Wed, 13 Mar 2024 08:48:03 -0400 Subject: [PATCH 36/38] add linear eval files with clip --- original_eval_scripts/imagenet/clip.slrm | 43 ++ simsiam/linear_eval_original_code_clip.py | 865 ++++++++++++++++++++++ 2 files changed, 908 insertions(+) create mode 100644 original_eval_scripts/imagenet/clip.slrm create mode 100644 simsiam/linear_eval_original_code_clip.py diff --git a/original_eval_scripts/imagenet/clip.slrm b/original_eval_scripts/imagenet/clip.slrm new file mode 100644 index 0000000..25f8638 --- /dev/null +++ b/original_eval_scripts/imagenet/clip.slrm @@ -0,0 +1,43 @@ +#!/bin/bash + +#SBATCH --job-name="clip_eval" +#SBATCH --partition=a40 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=clip_ab_imagenet_%j.out +#SBATCH --error=clip_ab_imagenet_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." 
+nvidia-smi + +python simsiam/linear_eval_original_code_clip.py \ + --data="/scratch/ssd004/datasets/imagenet256" \ + --multiprocessing-distributed \ + --lars --batch-size=2048 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ \ No newline at end of file diff --git a/simsiam/linear_eval_original_code_clip.py b/simsiam/linear_eval_original_code_clip.py new file mode 100644 index 0000000..e737615 --- /dev/null +++ b/simsiam/linear_eval_original_code_clip.py @@ -0,0 +1,865 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import builtins +import math +import os +import random +import shutil +import time +import warnings +from datetime import datetime + +import torch +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +import torchvision.datasets as datasets +import torchvision.models as models +import torchvision.transforms as transforms +from tqdm import tqdm +import clip +from inatural_dataset import INAT + + +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + +parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") +parser.add_argument( + "--data", + metavar="DIR", + default="/scratch/ssd004/datasets/imagenet256", + help="path to dataset.", +) +# parser.add_argument( +# "-a", +# "--arch", +# metavar="ARCH", +# default="resnet50", +# choices=model_names, +# help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +# ) +parser.add_argument( + "-j", + "--workers", + default=4, + type=int, + 
metavar="N", + help="number of data loading workers (default: 32)", +) +parser.add_argument( + "--epochs", default=90, type=int, metavar="N", help="number of total epochs to run" +) +parser.add_argument( + "-b", + "--batch-size", + default=4096, + type=int, + metavar="N", + help="mini-batch size (default: 4096), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) +parser.add_argument( + "--lr", + "--learning-rate", + default=0.1, + type=float, + metavar="LR", + help="initial (base) learning rate", + dest="lr", +) +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--wd", + "--weight-decay", + default=0.0, + type=float, + metavar="W", + help="weight decay (default: 0.)", + dest="weight_decay", +) +parser.add_argument( + "-p", + "--print-freq", + default=10, + type=int, + metavar="N", + help="print frequency (default: 10)", +) +parser.add_argument( + "--resume", + default="", + type=str, + metavar="PATH", + help="path to latest checkpoint (default: none)", +) +parser.add_argument( + "-e", + "--evaluate", + dest="evaluate", + action="store_true", + help="evaluate model on validation set", +) +parser.add_argument( + "--world-size", + default=-1, + type=int, + help="number of nodes for distributed training", +) +parser.add_argument( + "--rank", default=-1, type=int, help="node rank for distributed training" +) +parser.add_argument( + "--dist-url", + default="tcp://224.66.41.62:23456", + type=str, + help="url used to set up distributed training", +) +parser.add_argument( + "--dist-backend", default="nccl", type=str, help="distributed backend" +) +parser.add_argument( + "--seed", default=None, type=int, help="seed for initializing training. 
" +) +parser.add_argument("--gpu", default=None, type=int, help="GPU id to use.") +parser.add_argument( + "--multiprocessing-distributed", + action="store_true", + help="Use multi-processing distributed training to launch " + "N processes per node, which has N GPUs. This is the " + "fastest way to use PyTorch for either single node or " + "multi node data parallel training", +) + +# additional configs: +parser.add_argument( + "--pretrained", default="", type=str, help="path to simsiam pretrained checkpoint" +) +parser.add_argument("--lars", action="store_true", help="Use LARS") + +parser.add_argument("--dataset_name", default="imagenet", help="Name of the dataset.") + +parser.add_argument( + "--checkpoint_dir", + default="/projects/imagenet_synthetic/model_checkpoints", + help="Checkpoint root directory.", +) + +parser.add_argument( + "--num_classes", + default=1000, + type=int, + help="Number of classes in the dataset.", +) + +# parser.add_argument( +# "--ablation_mode", +# default="icgan", +# type=str, +# help="Using icgan or stable diffusion feature extractor for ablation study.", +# ) + +best_acc1 = 0 + +class CLIPClassifier(nn.Module): + def __init__(self, clip_model, num_classes): + super(CLIPClassifier, self).__init__() + self.clip_model = clip_model + + for param in self.clip_model.parameters(): + param.requires_grad = False + + self.linear = nn.Linear(clip_model.visual.output_dim, num_classes) + + def forward(self, x): + with torch.no_grad(): + x = self.clip_model.encode_image(x) + x = x.float() + x = self.linear(x) + return x + +def main(): + args = parser.parse_args() + current_time = datetime.now().strftime("%Y-%m-%d-%H-%M") + args.checkpoint_dir = os.path.join(args.checkpoint_dir, f"eval_{current_time}") + os.makedirs(args.checkpoint_dir, exist_ok=True) + + print(args) + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + # NOTE: this line can reduce speed considerably + # cudnn.deterministic = True + warnings.warn( + 
"You have chosen to seed training. " + "This will turn on the CUDNN deterministic setting, " + "which can slow down your training considerably! " + "You may see unexpected behavior when restarting " + "from checkpoints." + ) + + if args.gpu is not None: + warnings.warn( + "You have chosen a specific GPU. This will completely " + "disable data parallelism." + ) + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + print(args.world_size) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + ngpus_per_node = torch.cuda.device_count() + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + print("second", args.world_size) + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn( + main_worker, + nprocs=ngpus_per_node, + args=( + ngpus_per_node, + args, + ), + ) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + + +def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + print("spawn performed, gpu", gpu, flush=True) + args.gpu = gpu + + # suppress printing if not master + if args.multiprocessing_distributed and args.gpu != 0: + + def print_pass(*args, flush=True): + pass + + builtins.print = print_pass + + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu), flush=True) + + if args.distributed: + print("here", flush=True) + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + print("rank", args.rank, flush=True) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + print("second rank", args.rank, flush=True) + 
dist.init_process_group( + backend=args.dist_backend, + init_method=args.dist_url, + world_size=args.world_size, + rank=args.rank, + ) + print("init_process_group", flush=True) + torch.distributed.barrier() + + + # create model + print("=> creating model", flush=True) + # model = models.__dict__[args.arch]() + + # model.fc = nn.Linear(2048, args.num_classes) + # Load the pre-trained CLIP model + model, _ = clip.load("ViT-B/32") + model = model.float() + model = CLIPClassifier(model, args.num_classes) + args.start_epoch = 0 + + print("model", model.state_dict().keys(), flush=True) + + # # freeze all layers but the last fc + # for name, param in model.named_parameters(): + # if name not in ["fc.weight", "fc.bias"]: + # param.requires_grad = False + # # init the fc layer + # model.fc.weight.data.normal_(mean=0.0, std=0.01) + # model.fc.bias.data.zero_() + + # load from pre-trained, before DistributedDataParallel constructor + # if args.pretrained: + # if os.path.isfile(args.pretrained): + # print("=> loading checkpoint '{}'".format(args.pretrained), flush=True) + # checkpoint = torch.load(args.pretrained, map_location="cpu") + + # # rename moco pre-trained keys + # if args.ablation_mode == "icgan": + # state_dict = checkpoint + # else: + # state_dict = checkpoint["state_dict"] + # for k in list(state_dict.keys()): + # # retain only encoder up to before the embedding layer + # if args.ablation_mode == "icgan": + # if k.startswith("module") and not k.startswith( + # "module.fc" + # ): + # # remove prefix + # state_dict[k[len("module.") :]] = state_dict[k] + # # delete renamed or unused k + # del state_dict[k] + # else: + # if k.startswith("module.encoder") and not k.startswith( + # "module.encoder.fc" + # ): + # # remove prefix + # state_dict[k[len("module.encoder.") :]] = state_dict[k] + # # delete renamed or unused k + # del state_dict[k] + # msg = model.load_state_dict(state_dict, strict=False) + # assert set(msg.missing_keys) == {"fc.weight", "fc.bias"} + + # 
print("=> loaded pre-trained model '{}'".format(args.pretrained)) + # else: + # print("=> no checkpoint found at '{}'".format(args.pretrained)) + + # infer learning rate before changing batch size + init_lr = args.lr * args.batch_size / 256 + + if args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. + if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs we have + args.batch_size = int(args.batch_size / ngpus_per_node) + print("batchsize", args.batch_size, flush=True) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + print("workers", args.workers, flush=True) + print("gpu", args.gpu, flush=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.gpu] + ) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.gpu is not None: + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + # if args.arch.startswith("alexnet") or args.arch.startswith("vgg"): + # model.features = torch.nn.DataParallel(model.features) + # model.cuda() + # else: + model = torch.nn.DataParallel(model).cuda() + + # define loss function (criterion) and optimizer + criterion = nn.CrossEntropyLoss().cuda(args.gpu) + + # optimize only the linear classifier + parameters = list(filter(lambda p: p.requires_grad, model.parameters())) + assert len(parameters) == 2 # fc.weight, fc.bias + + optimizer = torch.optim.SGD( + parameters, init_lr, momentum=args.momentum, 
weight_decay=args.weight_decay + ) + if args.lars: + print("=> use LARS optimizer.", flush=True) + from LARC import LARC + + optimizer = LARC(optimizer=optimizer, trust_coefficient=0.001, clip=False) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume), flush=True) + if args.gpu is None: + checkpoint = torch.load(args.resume) + else: + # Map model to be loaded to specified single gpu. + loc = "cuda:{}".format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint["epoch"] + best_acc1 = checkpoint["best_acc1"] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint["state_dict"]) + optimizer.load_state_dict(checkpoint["optimizer"]) + print( + "=> loaded checkpoint '{}' (epoch {})".format( + args.resume, checkpoint["epoch"] + ), + flush=True, + ) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + # Data loading code + traindir = os.path.join(args.data, "train") + valdir = os.path.join(args.data, "val") + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + if args.dataset_name == "imagenet": + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ] + ), + ) + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ] + ), + ) + elif args.dataset_name == "food101": + print("=> using food101 dataset.", flush=True) + train_dataset = datasets.Food101( + root=args.data, + split="train", + transform=transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + 
transforms.ToTensor(), + normalize, + ], + ), + ) + val_dataset = datasets.Food101( + root=args.data, + split="test", + transform=transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ], + ), + ) + elif args.dataset_name == "cifar10": + train_dataset = datasets.CIFAR10( + root=args.data, + train=True, + download=True, + transform=transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ], + ), + ) + val_dataset = datasets.CIFAR10( + root=args.data, + train=False, + download=True, + transform=transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ], + ), + ) + elif args.dataset_name == "cifar100": + train_dataset = datasets.CIFAR100( + root=args.data, + train=True, + transform=transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ], + ), + ) + val_dataset = datasets.CIFAR100( + root=args.data, + train=False, + transform=transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ], + ), + ) + elif args.dataset_name == "places365": + train_dataset = datasets.Places365( + root=args.data, + split="train-standard", + transform=transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ], + ), + ) + val_dataset = datasets.Places365( + root=args.data, + split="val", + transform=transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ], + ), + ) + elif args.dataset_name == "INaturalist": + train_dataset = 
INAT( + root=args.data, + ann_file=os.path.join(args.data, "train2018.json"), + transform=transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ], + ), + ) + val_dataset = INAT( + root=args.data, + ann_file=os.path.join(args.data, "val2018.json"), + transform=transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ], + ), + ) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + else: + train_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.batch_size, + shuffle=(train_sampler is None), + num_workers=args.workers, + pin_memory=True, + sampler=train_sampler, + ) + + val_loader = torch.utils.data.DataLoader( + val_dataset, + batch_size=256, + shuffle=False, + num_workers=args.workers, + pin_memory=True, + ) + + if args.evaluate: + validate(val_loader, model, criterion, args) + return + + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + adjust_learning_rate(optimizer, init_lr, epoch, args) + + print("epoch", epoch, flush=True) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, args) + + # evaluate on validation set + acc1 = validate(val_loader, model, criterion, args) + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if not args.multiprocessing_distributed or ( + args.multiprocessing_distributed and args.rank % ngpus_per_node == 0 + ): + checkpoint_name = "checkpoint_{:04d}.pth.tar".format(epoch + 1) + checkpoint_file = os.path.join(args.checkpoint_dir, checkpoint_name) + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model.state_dict(), + "best_acc1": best_acc1, + "optimizer": optimizer.state_dict(), + }, + is_best, + filename=checkpoint_file, + ) + # if 
epoch == args.start_epoch: + # sanity_check(model.state_dict(), args.pretrained, args.ablation_mode) + + +def train(train_loader, model, criterion, optimizer, epoch, args): + batch_time = AverageMeter("Time", ":6.3f") + data_time = AverageMeter("Data", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch), + ) + + """ + Switch to eval mode: + Under the protocol of linear classification on frozen features/models, + it is not legitimate to change any part of the pre-trained model. + BatchNorm in train mode may revise running mean/std (even if it receives + no gradient), which are part of the model parameters too. + """ + model.eval() + + end = time.time() + i = 0 + for images, target in tqdm(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + if i == 0: + print("first step passed", flush=True) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + i += 1 + + +def validate(val_loader, model, criterion, args): + batch_time = AverageMeter("Time", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") + progress = ProgressMeter( + len(val_loader), 
[batch_time, losses, top1, top5], prefix="Test: " + ) + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + i = 0 + for images, target in tqdm(val_loader): + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i) + + i += 1 + + # # TODO: this should also be done with the ProgressMeter + print( + "\n * Accuracy@1 {top1.avg:.3f} Accuracy@5 {top5.avg:.3f}".format( + top1=top1, top5=top5 + ) + ) + + return top1.avg + + +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, "model_best.pth.tar") + + +# def sanity_check(state_dict, pretrained_weights, ablation_mode): +# """ +# Linear classifier should not change any weights other than the linear layer. +# This sanity check asserts nothing wrong happens (e.g., BN stats updated). +# """ +# print("=> loading '{}' for sanity check".format(pretrained_weights)) +# checkpoint = torch.load(pretrained_weights, map_location="cpu") +# if ablation_mode == "icgan": +# state_dict_pre = checkpoint +# else: +# state_dict_pre = checkpoint["state_dict"] + +# for k in list(state_dict.keys()): +# # only ignore fc layer +# if "fc.weight" in k or "fc.bias" in k: +# continue +# if ablation_mode == "icgan": +# # name in pretrained model +# k_pre = ( +# "module." + k[len("module.") :] +# if k.startswith("module.") +# else "module." + k +# ) + +# else: +# # name in pretrained model +# k_pre = ( +# "module.encoder." 
+ k[len("module.") :] +# if k.startswith("module.") +# else "module.encoder." + k +# ) + +# assert ( +# state_dict[k].cpu() == state_dict_pre[k_pre] +# ).all(), "{} is changed in linear classifier training.".format(k) + +# print("=> sanity check passed.") + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=":f"): + self.name = name + self.fmt = fmt + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print("\t".join(entries), flush=True) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = "{:" + str(num_digits) + "d}" + return "[" + fmt + "/" + fmt.format(num_batches) + "]" + + +def adjust_learning_rate(optimizer, init_lr, epoch, args): + """Decay the learning rate based on schedule""" + cur_lr = init_lr * 0.5 * (1.0 + math.cos(math.pi * epoch / args.epochs)) + for param_group in optimizer.param_groups: + param_group["lr"] = cur_lr + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = 
correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == "__main__": + main() From c2a4a9c89c1ed27feff5abac9e11b49f3170cbd2 Mon Sep 17 00:00:00 2001 From: Vahid Reza Khazaie Date: Thu, 14 Mar 2024 09:31:33 -0400 Subject: [PATCH 37/38] add slurm scripts for experiments --- original_eval_scripts/CIFAR10/icgan.slrm | 47 ++++++++++++++++++ .../INaturalist/baseline.slrm | 2 +- original_eval_scripts/INaturalist/icgan.slrm | 48 +++++++++++++++++++ .../INaturalist/stablediff.slrm | 48 +++++++++++++++++++ original_eval_scripts/places365/icgan.slrm | 47 ++++++++++++++++++ 5 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 original_eval_scripts/CIFAR10/icgan.slrm create mode 100644 original_eval_scripts/INaturalist/icgan.slrm create mode 100644 original_eval_scripts/INaturalist/stablediff.slrm create mode 100644 original_eval_scripts/places365/icgan.slrm diff --git a/original_eval_scripts/CIFAR10/icgan.slrm b/original_eval_scripts/CIFAR10/icgan.slrm new file mode 100644 index 0000000..b416f0f --- /dev/null +++ b/original_eval_scripts/CIFAR10/icgan.slrm @@ -0,0 +1,47 @@ +#!/bin/bash + +#SBATCH --job-name="cifar" +#SBATCH --partition=a40 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=cifar10_baseline_160_%j.out +#SBATCH --error=cifar10_baseline_160_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. 
+export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." +nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/projects/imagenet_synthetic/fereshteh_datasets" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_seed43_bs128_rforig_2024-03-05-12-52/checkpoint_0160.pth.tar"\ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="cifar10" \ No newline at end of file diff --git a/original_eval_scripts/INaturalist/baseline.slrm b/original_eval_scripts/INaturalist/baseline.slrm index d7c0acb..84af317 100644 --- a/original_eval_scripts/INaturalist/baseline.slrm +++ b/original_eval_scripts/INaturalist/baseline.slrm @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --job-name="inaturalist" -#SBATCH --partition=a40 +#SBATCH --partition=t4v2 #SBATCH --qos=deadline #SBATCH --account=deadline #SBATCH --nodes=1 diff --git a/original_eval_scripts/INaturalist/icgan.slrm b/original_eval_scripts/INaturalist/icgan.slrm new file mode 100644 index 0000000..fbe6a3e --- /dev/null +++ b/original_eval_scripts/INaturalist/icgan.slrm @@ -0,0 +1,48 @@ +#!/bin/bash + +#SBATCH --job-name="inaturalist" +#SBATCH --partition=a40 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=inaturalist_icgan_160_%j.out +#SBATCH --error=inaturalist_icgan_160_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load 
virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." +nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/datasets/inat_comp/2018/" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_seed43_bs128_rforig_2024-03-05-12-52/checkpoint_0160.pth.tar"\ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="INaturalist" \ + --num_classes=8142 \ No newline at end of file diff --git a/original_eval_scripts/INaturalist/stablediff.slrm b/original_eval_scripts/INaturalist/stablediff.slrm new file mode 100644 index 0000000..a3baaa1 --- /dev/null +++ b/original_eval_scripts/INaturalist/stablediff.slrm @@ -0,0 +1,48 @@ +#!/bin/bash + +#SBATCH --job-name="inaturalist" +#SBATCH --partition=a40 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=inaturalist_stablediff_160_%j.out +#SBATCH --error=inaturalist_stablediff_160_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. 
+export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." +nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/datasets/inat_comp/2018/" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_stablediff_p0p5_seed43_2024-03-05-13-39/checkpoint_0160.pth.tar"\ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="INaturalist" \ + --num_classes=8142 \ No newline at end of file diff --git a/original_eval_scripts/places365/icgan.slrm b/original_eval_scripts/places365/icgan.slrm new file mode 100644 index 0000000..841bbe9 --- /dev/null +++ b/original_eval_scripts/places365/icgan.slrm @@ -0,0 +1,47 @@ +#!/bin/bash + +#SBATCH --job-name="places365" +#SBATCH --partition=a40 +#SBATCH --qos=deadline +#SBATCH --account=deadline +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=0 +#SBATCH --output=places365_icgan_160_%j.out +#SBATCH --error=places365_icgan_160_%j.err +#SBATCH --open-mode=append +#SBATCH --wait-all-nodes=1 +#SBATCH --time=72:00:00 + +# load virtual environment +source /ssd003/projects/aieng/envs/genssl2/bin/activate + +export NCCL_IB_DISABLE=1 # Our cluster does not have InfiniBand. We need to disable usage using this flag. 
+export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 # set to 1 for NCCL backend +# export CUDA_LAUNCH_BLOCKING=1 +export MASTER_ADDR="$(hostname --fqdn)" +export MASTER_PORT="$(python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1])')" +export RDVZ_ID=$RANDOM +echo "RDZV Endpoint $MASTER_ADDR:$MASTER_PORT" + +echo $MASTER_ADDR +echo $MASTER_PORT + +export PYTHONPATH="." +nvidia-smi + +python simsiam/linear_eval_original_code.py \ + --data="/projects/imagenet_synthetic/fereshteh_datasets/places365" \ + --arch="resnet50" \ + --multiprocessing-distributed \ + --lars \ + --batch-size=4096 \ + --epochs=100 \ + -j=16 \ + --world-size 1 \ + --rank 0 \ + --pretrained="/projects/imagenet_synthetic/model_checkpoints/simsiam_icgan_seed43_bs128_rforig_2024-03-05-12-52/checkpoint_0160.pth.tar"\ + --dist-url "tcp://$MASTER_ADDR:$MASTER_PORT" \ + --dataset_name="places365" \ No newline at end of file From f88558f73e373e1fdea00590c9b8100c0668fc10 Mon Sep 17 00:00:00 2001 From: sanaAyrml Date: Thu, 28 Mar 2024 11:05:44 -0400 Subject: [PATCH 38/38] Update gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index a0b56cf..656e049 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,6 @@ dmypy.json # pycharm .idea/ + +# Trained models +trained_models/