CathodeX/Dockerfile.training at main · Kernel-Guard/CathodeX · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Build: docker build -f Dockerfile.training -t gcr.io/cathode-screening/chgnet-training:latest .
# Push: docker push gcr.io/cathode-screening/chgnet-training:latest

# Base image: NVIDIA CUDA 12.1.1 "devel" variant on Ubuntu 22.04.
# The CUDA 12.1 version lines up with the cu121 PyTorch wheel index used further down.
FROM nvidia/cuda:12.1.1-devel-ubuntu22.04

# Suppress interactive apt/debconf prompts during the build only. ARG is
# visible to the RUN steps of this stage but, unlike ENV, is not baked into
# the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Install system dependencies: Python 3.10 with pip and venv support, plus the
# git, wget and curl command-line tools. The apt package index is refreshed and
# removed within the same layer so it never persists in the image.
RUN apt-get update && apt-get install -y \
    curl \
    git \
    python3-pip \
    python3.10 \
    python3.10-venv \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Set Python alias: make `python` resolve to the Python 3.10 interpreter
# (-f replaces any existing /usr/bin/python link).
RUN ln -sf /usr/bin/python3.10 /usr/bin/python

# Upgrade pip. --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir --upgrade pip

# Install PyTorch with CUDA 12.1 from the cu121 wheel index.
# --no-cache-dir stops pip from keeping a second copy of the downloaded wheels
# in its cache directory, which would otherwise be stored in this layer.
RUN pip install --no-cache-dir \
    torch \
    torchvision \
    torchaudio \
    --index-url https://download.pytorch.org/whl/cu121

# Install CHGNet and dependencies.
# Every requirement is quoted: unquoted, the shell parses ">" as an output
# redirection, so `chgnet>=0.3.0` would install an unconstrained `chgnet` and
# redirect pip's stdout into a file named "=0.3.0" instead of applying the
# minimum version.
RUN pip install --no-cache-dir \
    "chgnet>=0.3.0" \
    "pymatgen>=2024.1.1" \
    "numpy>=1.24" \
    "pandas>=2.0" \
    "scikit-learn>=1.3" \
    "tqdm>=4.65" \
    "mp-api>=0.37" \
    "python-dotenv>=1.0"

# Additional training utilities.
# Requirements are quoted so the shell passes ">=" to pip as part of the
# version specifier instead of treating ">" as an output redirection.
RUN pip install --no-cache-dir \
    "tensorboard>=2.14" \
    "wandb>=0.16" \
    "pyarrow>=14.0"

# Create working directory (WORKDIR creates /app if it does not exist)
WORKDIR /app

# Copy the dependency manifest on its own and install from it before the
# source trees, so edits under scripts/ or src/ do not invalidate the cached
# dependency layer.
COPY requirements.txt /app/

# Install any remaining requirements. This is best-effort: a failure does not
# abort the build, but pip's error output stays visible in the build log and a
# warning is printed so the failure is not silent.
RUN pip install --no-cache-dir -r requirements.txt \
    || echo "WARNING: pip install -r requirements.txt failed; continuing without it" >&2

# Copy training scripts and source package
COPY scripts/ /app/scripts/
COPY src/ /app/src/

# Runtime environment for the container:
#   PYTHONUNBUFFERED=1     - Python writes stdout/stderr without buffering
#   CUDA_VISIBLE_DEVICES=0 - only CUDA device index 0 is exposed to the process
ENV PYTHONUNBUFFERED=1 \
    CUDA_VISIBLE_DEVICES=0

# Default command, run through `bash -c` so three steps can be chained:
#   1. scripts/download_data.py                      - download data
#   2. scripts/36_train_gcp_l4.py --phase finetune   - FINE-TUNE ONLY (skips 12GB pretrain data)
#   3. scripts/upload_results.py                     - upload results
# The steps are joined with `&&`, so each one runs only if the previous step
# exited successfully. Script paths are relative to the working directory.
CMD ["bash", "-c", "python scripts/download_data.py && python scripts/36_train_gcp_l4.py --phase finetune && python scripts/upload_results.py"]