Updated Dependencies, Better Docker Support, and Segmentation Demo #480

Open: wants to merge 10 commits into base: master
3 changes: 2 additions & 1 deletion .dockerignore
@@ -1,2 +1,3 @@
docs
Dockerfile
Dockerfile
.idea
90 changes: 55 additions & 35 deletions Dockerfile
@@ -1,43 +1,63 @@
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
# Base image with CUDA support
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS base

ARG MODEL="yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py"
ARG WEIGHT="yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth"

ENV FORCE_CUDA="1"
ENV MMCV_WITH_OPS=1
# Set environment variables
ENV FORCE_CUDA="1" \
MMCV_WITH_OPS=1 \
DEBIAN_FRONTEND=noninteractive

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python3-pip \
python3-pip \
libgl1-mesa-glx \
libsm6 \
libxext6 \
libxrender-dev \
libglib2.0-0 \
git \
python3-dev \
python3-wheel

RUN pip3 install --upgrade pip \
&& pip3 install \
gradio \
opencv-python \
supervision \
mmengine \
setuptools \
openmim \
&& mim install mmcv==2.0.0 \
&& pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cu118 \
wheel \
torch \
torchvision \
torchaudio

COPY . /yolo
libsm6 \
libxext6 \
libxrender-dev \
libglib2.0-0 \
git \
python3-dev \
python3-wheel \
curl \
&& rm -rf /var/lib/apt/lists/*

# Install Python dependencies
FROM base AS python_deps

RUN pip3 install --upgrade pip wheel \
&& pip3 install --no-cache-dir torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121 \
&& pip3 install --no-cache-dir \
gradio==4.16.0 \
opencv-python==4.9.0.80 \
supervision \
mmengine==0.10.4 \
setuptools \
openmim \
onnx \
onnxsim \
&& mim install mmcv==2.1.0 \
&& mim install mmdet==3.3.0 \
&& pip3 install --no-cache-dir git+https://github.com/onuralpszr/mmyolo.git

# Clone and install YOLO-World
FROM python_deps AS yolo_world

RUN git clone --recursive https://github.com/AILab-CVC/YOLO-World /yolo/
WORKDIR /yolo

RUN pip3 install -e .
RUN pip3 install -e .[demo]

# Final stage
FROM yolo_world AS final

ARG MODEL="yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py"
ARG WEIGHT="yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth"

# Create weights directory and set permissions
RUN mkdir /weights/ \
&& chmod a+rwx /yolo/configs/*/*

RUN curl -o weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT
# Optionally download weights (commented out by default)
# RUN curl -o /weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT

ENTRYPOINT [ "python3", "demo.py" ]
CMD ["configs/pretrain/$MODEL", "weights/$WEIGHT"]
# Set the default command
CMD ["bash"]
1 change: 1 addition & 0 deletions README.md
@@ -37,6 +37,7 @@ We recommend that everyone **use English to communicate on issues**, as this hel
For business licensing and other related inquiries, don't hesitate to contact `[email protected]`.

## 🔥 Updates
`[2024-8-31]`: Segmentation demo added to the demo/ folder. Try it out in docker with `./build_and_run.sh seg-l`!
`[2024-7-8]`: YOLO-World now has been integrated into [ComfyUI](https://github.com/StevenGrove/ComfyUI-YOLOWorld)! Come and try adding YOLO-World to your workflow now! You can access it at [StevenGrove/ComfyUI-YOLOWorld](https://github.com/StevenGrove/ComfyUI-YOLOWorld)!
`[2024-5-18]:` YOLO-World models have been [integrated with the FiftyOne computer vision toolkit](https://docs.voxel51.com/integrations/ultralytics.html#open-vocabulary-detection) for streamlined open-vocabulary inference across image and video datasets.
`[2024-5-16]:` Hey guys! Long time no see! This update contains (1) [fine-tuning guide](https://github.com/AILab-CVC/YOLO-World?#highlights--introduction) and (2) [TFLite Export](./docs/tflite_deploy.md) with INT8 Quantization.
95 changes: 95 additions & 0 deletions build_and_run.sh
@@ -0,0 +1,95 @@
#!/usr/bin/env bash

# Exit immediately if a command exits with a non-zero status.
set -e

# Set MODEL_DIR if not already set in the environment
: "${MODEL_DIR:="../models/models-yoloworld"}"

# DocString for the script
: '
This script builds and runs a Docker container for YOLO-World demos.
It supports various pre-trained models and configurations for object detection and segmentation.

Usage:
./build_and_run.sh <model-key>

Environment Variables:
MODEL_DIR: Path to the directory containing model weights (default: "../models/models-yoloworld")

Arguments:
<model-key>: Key for the desired model configuration (see available keys below)

Available model keys:
seg-l, seg-l-seghead, seg-m, seg-m-seghead,
pretrain-l-clip-800ft, pretrain-l-clip, pretrain-l-1280ft, pretrain-l,
pretrain-m-1280ft, pretrain-m, pretrain-s-1280ft, pretrain-s,
pretrain-x-cc3mlite, pretrain-x-1280ft
'

# Define associative array for model configurations
declare -A models
models["seg-l"]="yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth"
models["seg-l-seghead"]="yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis-5a642d30.pth"
models["seg-m"]="yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-ca465825.pth"
models["seg-m-seghead"]="yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis-7bca59a7.pth"
models["pretrain-l-clip-800ft"]="yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py yolo_world_v2_l_clip_large_o365v1_goldg_pretrain_800ft-9df82e55.pth"
models["pretrain-l-clip"]="yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py yolo_world_v2_l_clip_large_o365v1_goldg_pretrain-8ff2e744.pth"
models["pretrain-l-1280ft"]="yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py yolo_world_v2_l_obj365v1_goldg_pretrain_1280ft-9babe3f6.pth"
models["pretrain-l"]="yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth"
models["pretrain-m-1280ft"]="yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py yolo_world_v2_m_obj365v1_goldg_pretrain_1280ft-77d0346d.pth"
models["pretrain-m"]="yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py yolo_world_v2_m_obj365v1_goldg_pretrain-c6237d5b.pth"
models["pretrain-s-1280ft"]="yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py yolo_world_v2_s_obj365v1_goldg_pretrain_1280ft-fc4ff4f7.pth"
models["pretrain-s"]="yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py yolo_world_v2_s_obj365v1_goldg_pretrain-55b943ea.pth"
models["pretrain-x-cc3mlite"]="yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_cc3mlite_train_lvis_minival.py yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain-8698fbfa.pth"
models["pretrain-x-1280ft"]="yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth"

# Function to display usage information
show_usage() {
echo "Usage: $0 <model-key>"
echo "Available model keys:"
for key in "${!models[@]}"; do
echo " $key"
done
}

# Check if a model key is provided
if [ $# -eq 0 ]; then
show_usage
exit 1
fi

model_key=$1

# Validate the model key
if [ -z "${models[$model_key]}" ]; then
echo "Invalid model key."
show_usage
exit 1
fi

# Extract model and weight information
read -r MODEL WEIGHT <<< "${models[$model_key]}"

# Set configuration directory and demo file based on model type
config_dir="configs/pretrain"
demo_file="demo/gradio_demo.py"
if [[ $model_key == seg-* ]]; then
config_dir="configs/segmentation"
demo_file="demo/segmentation_demo.py"
fi

# Build Docker image and run container
echo "Building Docker image..."
docker build -f ./Dockerfile --no-cache \
--build-arg="MODEL=$MODEL" \
--build-arg="WEIGHT=$WEIGHT" \
-t "yolo-demo:latest" .

echo "Running Docker container..."
docker run -it \
-v "$(readlink -f "$MODEL_DIR"):/weights/" \
--runtime nvidia \
-p 8080:8080 \
"yolo-demo:latest" \
python3 "$demo_file" "$config_dir/$MODEL" "/weights/$WEIGHT"
@@ -18,7 +18,6 @@
weight_decay = 0.05
train_batch_size_per_gpu = 16
load_from = 'pretrained_models/yolo_world_m_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-c6237d5b.pth'
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'
persistent_workers = False

@@ -16,7 +16,6 @@
weight_decay = 0.025
train_batch_size_per_gpu = 4
load_from = "pretrained_models/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth"
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'
img_scale = (1280, 1280)

@@ -15,7 +15,6 @@
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'
# model settings
model = dict(
@@ -15,7 +15,6 @@
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'
# model settings
model = dict(
@@ -16,6 +16,7 @@
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'
img_scale = (1280, 1280)

text_model_name = 'openai/clip-vit-base-patch32'
@@ -15,7 +15,6 @@
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'
# model settings
model = dict(
@@ -15,8 +15,7 @@
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
# text_model_name = 'openai/clip-vit-base-patch32'
text_model_name = 'openai/clip-vit-base-patch32'
img_scale = (1280, 1280)

# model settings
@@ -15,7 +15,6 @@
base_lr = 2e-3
weight_decay = 0.05 / 2
train_batch_size_per_gpu = 16
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'
# model settings
model = dict(
@@ -18,7 +18,7 @@
load_from = 'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth'
persistent_workers = False
text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
# text_model_name = 'openai/clip-vit-base-patch32'
text_model_name = 'openai/clip-vit-base-patch32'
# Polygon2Mask
downsample_ratio = 4
mask_overlap = False
@@ -16,7 +16,6 @@
weight_decay = 0.05
train_batch_size_per_gpu = 8
load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth'
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'
persistent_workers = False

@@ -17,7 +17,7 @@
train_batch_size_per_gpu = 8
load_from = 'pretrained_models/yolo_world_m_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-c6237d5b.pth'
text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
# text_model_name = 'openai/clip-vit-base-patch32'
text_model_name = 'openai/clip-vit-base-patch32'
persistent_workers = False

# Polygon2Mask
19 changes: 16 additions & 3 deletions demo/README.md
@@ -19,11 +19,24 @@ pip install gradio==4.16.0
python demo/demo.py path/to/config path/to/weights
```

Additionaly, you can use a Dockerfile to build an image with gradio. As a prerequisite, make sure you have respective drivers installed alongside [nvidia-container-runtime](https://stackoverflow.com/questions/59691207/docker-build-with-nvidia-runtime). Replace MODEL_NAME and WEIGHT_NAME with the respective values or ommit this and use default values from the [Dockerfile](Dockerfile#3)
Additionally, you can use our Docker build system for an easier setup:

```bash
docker build --build-arg="MODEL=MODEL_NAME" --build-arg="WEIGHT=WEIGHT_NAME" -t yolo_demo .
docker run --runtime nvidia -p 8080:8080
./build_and_run.sh <model-key>
```

Available model keys include:
- seg-l, seg-l-seghead, seg-m, seg-m-seghead
- pretrain-l-clip-800ft, pretrain-l-clip, pretrain-l-1280ft, pretrain-l
- pretrain-m-1280ft, pretrain-m, pretrain-s-1280ft, pretrain-s
- pretrain-x-cc3mlite, pretrain-x-1280ft

This script will build the Docker image and run the container with the specified model configuration. The Gradio interface will be accessible at `http://localhost:8080`.

You can also customize the model weights directory by setting the `MODEL_DIR` environment variable:

```bash
MODEL_DIR=/path/to/your/weights ./build_and_run.sh <model-key>
```
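
Since the image no longer downloads weights at build time, the directory `MODEL_DIR` points at must already contain the `.pth` file for the chosen key. A sketch of pre-fetching the seg-l weight, assuming it is published under the same Hugging Face repository referenced in the Dockerfile:

```bash
# Download the seg-l weight (filename taken from build_and_run.sh) into the default MODEL_DIR.
mkdir -p ../models/models-yoloworld
curl -L -o ../models/models-yoloworld/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth \
  https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth
```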

#### Image Demo