Skip to content

Commit

Permalink
[EAGLE-5448]: upload improvements (#498)
Browse files Browse the repository at this point in the history
* don't use shell for hf token validation

* update dep in dockerfile

* change version
  • Loading branch information
zeiler authored Jan 24, 2025
1 parent 21072ba commit 0830812
Show file tree
Hide file tree
Showing 11 changed files with 153 additions and 57 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## [[11.0.7]](https://github.com/Clarifai/clarifai-python/releases/tag/11.0.7) - [PyPI](https://pypi.org/project/clarifai/11.0.7/) - 2025-01-24

### Changed

- Updated model upload experience [(#498)](https://github.com/Clarifai/clarifai-python/pull/498)

## [[11.0.6]](https://github.com/Clarifai/clarifai-python/releases/tag/11.0.6) - [PyPI](https://pypi.org/project/clarifai/11.0.6/) - 2025-01-24

### Changed
Expand Down
2 changes: 1 addition & 1 deletion clarifai/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "11.0.6"
__version__ = "11.0.7"
4 changes: 4 additions & 0 deletions clarifai/cli/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Entry point for `python -m clarifai.cli`: delegates straight to the CLI's main().
# Kept minimal so the Dockerfile's exec-form RUN/ENTRYPOINT can invoke the CLI
# without a shell.
from clarifai.cli.base import main

if __name__ == "__main__":
    main()
3 changes: 2 additions & 1 deletion clarifai/cli/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,5 +108,6 @@ def login(ctx, config, env, user_id):
# Import the CLI commands to register them
load_command_modules()

if __name__ == '__main__':

def main():
    """Console-script entry point: invokes the top-level Clarifai CLI group."""
    cli()
25 changes: 23 additions & 2 deletions clarifai/cli/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@

@cli.group(['model'])
def model():
"""Manage models: upload, test locally, run_locally, predict"""
pass
"""Manage models: upload, test locally, run locally, predict, and more"""


@model.command()
Expand Down Expand Up @@ -34,6 +33,28 @@ def upload(model_path, download_checkpoints, skip_dockerfile):
model_upload.main(model_path, download_checkpoints, skip_dockerfile)


@model.command()
@click.option(
    '--model_path',
    type=click.Path(exists=True),
    required=True,
    help='Path to the model directory.')
@click.option(
    '--out_path',
    type=click.Path(exists=False),
    required=False,
    default=None,
    help=
    'Optional path to write the checkpoints to. This will place them in {out_path}/ If not provided it will default to {model_path}/1/checkpoints where the config.yaml is read.'
)
def download_checkpoints(model_path, out_path):
  """Download checkpoints from external source to local model_path.

  Reads the checkpoints: block of {model_path}/config.yaml and downloads the
  referenced checkpoints to out_path (or the default location when out_path is None).
  """

  # Deferred import so the CLI starts fast and heavy runner dependencies are only
  # loaded when this command actually runs.
  from clarifai.runners.models.model_upload import ModelUploader
  uploader = ModelUploader(model_path, download_validation_only=True)
  uploader.download_checkpoints(out_path)


@model.command()
@click.option(
'--model_path',
Expand Down
93 changes: 66 additions & 27 deletions clarifai/runners/dockerfile_template/Dockerfile.template
Original file line number Diff line number Diff line change
@@ -1,43 +1,82 @@
FROM --platform=$TARGETPLATFORM ${BASE_IMAGE} as build
# syntax=docker/dockerfile:1
#############################
# User specific requirements installed in the pip_packages
#############################
FROM --platform=$TARGETPLATFORM ${BUILDER_IMAGE} as pip_packages

ENV DEBIAN_FRONTEND=noninteractive
COPY --link requirements.txt /home/nonroot/requirements.txt

# Update clarifai package so we always have latest protocol to the API. Everything should land in /venv
RUN pip install --no-cache-dir -r /home/nonroot/requirements.txt && \
(pip install --upgrade --upgrade-strategy only-if-needed --no-deps --no-cache-dir clarifai clarifai-grpc clarifai-protocol || true)
#############################

#############################
# User specific requirements
# Downloader dependencies image
#############################
COPY requirements.txt .
FROM --platform=$TARGETPLATFORM ${DOWNLOADER_IMAGE} as downloader

# Install requirements and clarifai package and cleanup before leaving this line.
# Note(zeiler): this could be in a future template as {{model_python_deps}}
RUN pip install --no-cache-dir -r requirements.txt && \
pip install --no-cache-dir clarifai
# make sure we have the latest clarifai package.
RUN (pip install --upgrade --upgrade-strategy only-if-needed --no-cache-dir clarifai clarifai-grpc clarifai-protocol || true)
#####

# These will be set by the templating system.
ENV CLARIFAI_PAT=${CLARIFAI_PAT}
ENV CLARIFAI_USER_ID=${CLARIFAI_USER_ID}
ENV CLARIFAI_RUNNER_ID=${CLARIFAI_RUNNER_ID}
ENV CLARIFAI_NODEPOOL_ID=${CLARIFAI_NODEPOOL_ID}
ENV CLARIFAI_COMPUTE_CLUSTER_ID=${CLARIFAI_COMPUTE_CLUSTER_ID}
ENV CLARIFAI_API_BASE=${CLARIFAI_API_BASE}

#############################
# Final runtime image
#############################
FROM --platform=$TARGETPLATFORM ${RUNTIME_IMAGE} as final

# Set the NUMBA cache dir to /tmp
ENV NUMBA_CACHE_DIR=/tmp/numba_cache
# Set the TORCHINDUCTOR cache dir to /tmp
ENV TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor_cache
ENV HOME=/tmp
# The CLARIFAI* will be set by the templating system.
ENV NUMBA_CACHE_DIR=/tmp/numba_cache \
TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor_cache \
HOME=/tmp \
DEBIAN_FRONTEND=noninteractive

# Set the working directory to /app
WORKDIR /app
#####
# Copy the python requirements needed to download checkpoints
#####
COPY --link=true --from=downloader /venv /venv
#####

# Copy the current folder into /app/model_dir that the SDK will expect.
# Note(zeiler): would be nice to exclude checkpoints in case they were pre-downloaded.
COPY . /app/model_dir/${name}
#####
# Copy the files needed to download
#####
# This creates the directory that the HF downloader will populate, with nonroot:nonroot permissions.
COPY --chown=nonroot:nonroot downloader/unused.yaml /home/nonroot/main/1/checkpoints/.cache/unused.yaml

# Add the model directory to the python path.
ENV PYTHONPATH=${PYTHONPATH}:/app/model_dir/${name}
#####
# Download checkpoints
COPY --link=true config.yaml /home/nonroot/main/
RUN ["python", "-m", "clarifai.cli", "model", "download-checkpoints", "--model_path", "/home/nonroot/main", "--out_path", "/home/nonroot/main"]
#####

ENTRYPOINT ["python", "-m", "clarifai.runners.server"]

#####
# Copy the python packages from the previous stage.
COPY --link=true --from=pip_packages /venv /venv
#####

# Copy in the actual files like config.yaml, requirements.txt, and most importantly 1/model.py
# for the actual model.
# If checkpoints aren't downloaded since a checkpoints: block is not provided, then they will
# be in the build context and copied here as well.
COPY --link=true 1/model.py /home/nonroot/main/1/model.py
# At this point we only need these for validation in the SDK.
COPY --link=true requirements.txt config.yaml /home/nonroot/main/

# Add the model directory to the python path.
ENV PYTHONPATH=${PYTHONPATH}:/home/nonroot/main \
CLARIFAI_PAT=${CLARIFAI_PAT} \
CLARIFAI_USER_ID=${CLARIFAI_USER_ID} \
CLARIFAI_RUNNER_ID=${CLARIFAI_RUNNER_ID} \
CLARIFAI_NODEPOOL_ID=${CLARIFAI_NODEPOOL_ID} \
CLARIFAI_COMPUTE_CLUSTER_ID=${CLARIFAI_COMPUTE_CLUSTER_ID} \
CLARIFAI_API_BASE=${CLARIFAI_API_BASE}

# Finally run the clarifai entrypoint to start the runner loop and local dev server.
# Note(zeiler): we may want to make this a clarifai CLI call.
CMD ["--model_path", "/app/model_dir/main"]
ENTRYPOINT ["python", "-m", "clarifai.runners.server"]
CMD ["--model_path", "/home/nonroot/main"]
#############################
53 changes: 39 additions & 14 deletions clarifai/runners/models/model_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
from rich.markup import escape

from clarifai.client import BaseClient
from clarifai.runners.utils.const import (AVAILABLE_PYTHON_IMAGES, AVAILABLE_TORCH_IMAGES,
CONCEPTS_REQUIRED_MODEL_TYPE, DEFAULT_PYTHON_VERSION,
PYTHON_BASE_IMAGE, TORCH_BASE_IMAGE)
from clarifai.runners.utils.const import (
AVAILABLE_PYTHON_IMAGES, AVAILABLE_TORCH_IMAGES, CONCEPTS_REQUIRED_MODEL_TYPE,
DEFAULT_PYTHON_VERSION, PYTHON_BUILDER_IMAGE, PYTHON_RUNTIME_IMAGE, TORCH_BASE_IMAGE)
from clarifai.runners.utils.loader import HuggingFaceLoader
from clarifai.urls.helper import ClarifaiUrlHelper
from clarifai.utils.logging import logger
Expand Down Expand Up @@ -247,6 +247,10 @@ def _parse_requirements(self):
if match:
dependency = match.group('dependency')
version = match.group('version')
if dependency == "torch" and line.find(
'whl/cpu') > 0:  # Ignore torch-cpu whl files, use base image.
continue

deendencies_version[dependency] = version if version else None
return deendencies_version

Expand Down Expand Up @@ -279,28 +283,37 @@ def create_dockerfile(self):
)
python_version = DEFAULT_PYTHON_VERSION

base_image = PYTHON_BASE_IMAGE.format(python_version=python_version)
# This is always the final image used for runtime.
runtime_image = PYTHON_RUNTIME_IMAGE.format(python_version=python_version)
builder_image = PYTHON_BUILDER_IMAGE.format(python_version=python_version)
downloader_image = PYTHON_BUILDER_IMAGE.format(python_version=python_version)

# Parse the requirements.txt file to determine the base image
dependencies = self._parse_requirements()
if 'torch' in dependencies and dependencies['torch']:
torch_version = dependencies['torch']

for image in AVAILABLE_TORCH_IMAGES:
# Sort in reverse so that newer cuda versions come first and are preferred.
for image in sorted(AVAILABLE_TORCH_IMAGES, reverse=True):
if torch_version in image and f'py{python_version}' in image:
cuda_version = image.split('-')[-1].replace('cuda', '')
base_image = TORCH_BASE_IMAGE.format(
builder_image = TORCH_BASE_IMAGE.format(
torch_version=torch_version,
python_version=python_version,
cuda_version=cuda_version,
)
# download_image = base_image
logger.info(f"Using Torch version {torch_version} base image to build the Docker image")
break

# else: # if not torch then use the download image for the base image too
# # base_image = download_image
# requirements_image = base_image
# Replace placeholders with actual values
dockerfile_content = dockerfile_template.safe_substitute(
name='main',
BASE_IMAGE=base_image,
BUILDER_IMAGE=builder_image, # for pip requirements
RUNTIME_IMAGE=runtime_image, # for runtime
DOWNLOADER_IMAGE=downloader_image, # for downloading checkpoints
)

# Write Dockerfile
Expand All @@ -309,7 +322,10 @@ def create_dockerfile(self):

@property
def checkpoint_path(self):
return os.path.join(self.folder, self.checkpoint_suffix)
return self._checkpoint_path(self.folder)

def _checkpoint_path(self, folder):
    """Return the checkpoint directory under `folder` (folder joined with checkpoint_suffix)."""
    return os.path.join(folder, self.checkpoint_suffix)

@property
def checkpoint_suffix(self):
Expand All @@ -319,7 +335,14 @@ def checkpoint_suffix(self):
def tar_file(self):
return f"{self.folder}.tar.gz"

def download_checkpoints(self):
def download_checkpoints(self, checkpoint_path_override: str = None):
"""
Downloads the checkpoints specified in the config file.
:param checkpoint_path_override: The path to download the checkpoints to. If not provided, the
default path is used based on the folder ModelUploader was initialized with. The
checkpoint_suffix will be appended to the path.
"""
if not self.config.get("checkpoints"):
logger.info("No checkpoints specified in the config file")
return True
Expand All @@ -329,7 +352,9 @@ def download_checkpoints(self):
success = True
if loader_type == "huggingface":
loader = HuggingFaceLoader(repo_id=repo_id, token=hf_token)
success = loader.download_checkpoints(self.checkpoint_path)
path = self._checkpoint_path(
checkpoint_path_override) if checkpoint_path_override else self.checkpoint_path
success = loader.download_checkpoints(path)

if loader_type:
if not success:
Expand Down Expand Up @@ -462,7 +487,7 @@ def filter_func(tarinfo):
f"request_id: {response.status.req_id}",
end='\r',
flush=True)
print()
logger.info("")
if response.status.code != status_code_pb2.MODEL_BUILDING:
logger.error(f"Failed to upload model version: {response}")
return
Expand Down Expand Up @@ -552,11 +577,11 @@ def monitor_model_build(self):
for log_entry in logs.log_entries:
if log_entry.url not in seen_logs:
seen_logs.add(log_entry.url)
print(f"Model Building Logs...: {escape(log_entry.message.strip())}")
logger.info(f"{escape(log_entry.message.strip())}")
time.sleep(1)
elif status_code == status_code_pb2.MODEL_TRAINED:
logger.info(f"\nModel build complete! (elapsed {time.time() - st:.1f}s)")
logger.info(f"Check out the model at {self.model_url}")
logger.info(f"Check out the model at {self.model_url} version: {self.model_version_id}")
return True
else:
logger.info(
Expand Down
11 changes: 4 additions & 7 deletions clarifai/runners/utils/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,26 @@

registry = os.environ.get('CLARIFAI_BASE_IMAGE_REGISTRY', 'public.ecr.aws/clarifai-models')

PYTHON_BASE_IMAGE = registry + '/python-base:{python_version}'
TORCH_BASE_IMAGE = registry + '/torch:{torch_version}-py{python_version}-cuda{cuda_version}'
PYTHON_BUILDER_IMAGE = registry + '/python-base:builder-{python_version}'
PYTHON_RUNTIME_IMAGE = registry + '/python-base:runtime-{python_version}'
TORCH_BASE_IMAGE = registry + '/torch:builder-{torch_version}-py{python_version}-cuda{cuda_version}'

# List of available python base images
AVAILABLE_PYTHON_IMAGES = ['3.11', '3.12']

DEFAULT_PYTHON_VERSION = 3.12

# List of available torch images
# Keep sorted by most recent cuda version.
AVAILABLE_TORCH_IMAGES = [
'2.4.0-py3.11-cuda124',
'2.4.1-py3.11-cuda124',
'2.5.1-py3.11-cuda124',
'2.4.0-py3.12-cuda124',
'2.4.1-py3.12-cuda124',
'2.5.1-py3.12-cuda124',
# '2.2.2-py3.13-cuda121',
# '2.3.1-py3.13-cuda121',
# '2.4.0-py3.13-cuda121',
# '2.4.0-py3.13-cuda124',
# '2.4.1-py3.13-cuda121',
# '2.4.1-py3.13-cuda124',
# '2.5.1-py3.13-cuda121',
# '2.5.1-py3.13-cuda124',
]
CONCEPTS_REQUIRED_MODEL_TYPE = [
Expand Down
7 changes: 5 additions & 2 deletions clarifai/runners/utils/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import json
import os
import shutil
import subprocess

from clarifai.utils.logging import logger

Expand All @@ -17,7 +16,11 @@ def __init__(self, repo_id=None, token=None):
self.token = token
if token:
if self.validate_hftoken(token):
subprocess.run(f'huggingface-cli login --token={os.environ["HF_TOKEN"]}', shell=True)
try:
from huggingface_hub import login
except ImportError:
raise ImportError(self.HF_DOWNLOAD_TEXT)
login(token=token)
logger.info("Hugging Face token validated")
else:
logger.info("Continuing without Hugging Face token")
Expand Down
2 changes: 1 addition & 1 deletion clarifai/utils/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def load_command_modules():
package_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'cli')

for _, module_name, _ in pkgutil.iter_modules([package_dir]):
if module_name != 'base': # Skip the base.py file itself
if module_name not in ['base', '__main__']: # Skip the base.py and __main__ file itself
importlib.import_module(f'clarifai.cli.{module_name}')


Expand Down
4 changes: 2 additions & 2 deletions clarifai/utils/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,8 @@ def _configure_logger(name: str, logger_level: Union[int, str] = logging.NOTSET)
else:
# Add the new rich handler and formatter
handler = RichHandler(
rich_tracebacks=True, log_time_format="%Y-%m-%d %H:%M:%S", console=Console(width=255))
formatter = logging.Formatter('%(name)s: %(message)s')
rich_tracebacks=True, log_time_format="%Y-%m-%d %H:%M:%S.%f", console=Console(width=255))
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

Expand Down

0 comments on commit 0830812

Please sign in to comment.