This script is designed to extract image-caption pairs from a directory containing tar files of scientific articles.
python3 extract_image_caption.py -i <input_directory> -o <output_directory> -n -l <log level: info | debug> -flatten <whether to flatten the output directory; should be True> --omit_image_file <sets whether image files are output or not [True | False]> --caption_output_type parquet <caption file output type [csv | parquet]>
-
Create virtual environment
conda create -p <path to directory> python=3.11 -
Activate virtual environment
conda activate <path to directory> -
Install PyTorch (https://pytorch.org/get-started/locally/)
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -
Install pandas, dask, pyarrow
conda install pandas dask pyarrow -
Install mpi4py (optional; used in batch scripts only)
conda install mpi4py -
Install the CLIP package or the Hugging Face CLIP implementation (transformers)
conda install -c huggingface transformers -
Install faiss package for nearest neighbor search
conda install -c conda-forge faiss -
Install other dependencies
conda install scikit-learn accelerate -c conda-forge -
Install ScientificImageCaption Package
pip install git+https://github.com/Khempawin/scientific-image-caption-pair.git
conda create -n faiss python=3.11.7
conda activate faiss
conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
conda install conda-forge::pandas conda-forge::numpy conda-forge::matplotlib conda-forge::scikit-learn
conda install conda-forge::pyarrow
conda install conda-forge::transformers conda-forge::datasets conda-forge::accelerate
conda install -c pytorch -c nvidia faiss-gpu=1.8.0
conda install conda-forge::mpi4py
- Dev interactive
conda install tqdm jupyter ipywidgets ipython
-
Create virtual environment
python -m venv directory -
Activate virtual environment
. /path/to/virtual_env/bin/activate -
Upgrade pip
python -m pip install --upgrade pip -
Install PyTorch (https://pytorch.org/get-started/locally/)
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 -
Install pandas, dask, pyarrow
pip install pandas dask pyarrow -
Install the CLIP package or the Hugging Face CLIP implementation (transformers)
-
Install ScientificImageCaption Package
-
Install mpi4py (optional; used in batch scripts only)