diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml index 6c9bacf656..336e44ef54 100644 --- a/.github/workflows/python-tests.yml +++ b/.github/workflows/python-tests.yml @@ -22,13 +22,7 @@ jobs: fail-fast: false matrix: python-version: ["3.9", "3.13"] - test_name: - [ - "Repository only", - "Everything else", - "Inference only", - "Xet only" - ] + test_name: ["Everything else", "Inference only", "Xet only"] include: - python-version: "3.13" # LFS not ran on 3.9 test_name: "lfs" @@ -65,7 +59,7 @@ jobs: case "${{ matrix.test_name }}" in - "Repository only" | "Everything else" | "Inference only") + "Everything else" | "Inference only") sudo apt update sudo apt install -y libsndfile1-dev ;; @@ -112,13 +106,6 @@ jobs: case "${{ matrix.test_name }}" in - "Repository only") - # Run repo tests concurrently - PYTEST="$PYTEST ../tests -k 'TestRepository' -n 4" - echo $PYTEST - eval $PYTEST - ;; - "Inference only") # Run inference tests concurrently PYTEST="$PYTEST ../tests -k 'test_inference' -n 4" diff --git a/docs/source/cn/_toctree.yml b/docs/source/cn/_toctree.yml index b4949efa35..db6d3244a9 100644 --- a/docs/source/cn/_toctree.yml +++ b/docs/source/cn/_toctree.yml @@ -20,7 +20,4 @@ title: 概览 - local: guides/hf_file_system title: Hugging Face 文件系统 -- title: "concepts" - sections: - - local: concepts/git_vs_http - title: Git vs HTTP 范式 + diff --git a/docs/source/cn/concepts/git_vs_http.md b/docs/source/cn/concepts/git_vs_http.md deleted file mode 100644 index b582b5f991..0000000000 --- a/docs/source/cn/concepts/git_vs_http.md +++ /dev/null @@ -1,40 +0,0 @@ - - -# Git 与 HTTP 范式 - -`huggingface_hub`库是用于与Hugging Face Hub进行交互的库,Hugging Face Hub是一组基于Git的存储库(模型、数据集或Spaces)。使用 `huggingface_hub`有两种主要方式来访问Hub。 - -第一种方法,即所谓的“基于git”的方法,由[`Repository`]类驱动。这种方法使用了一个包装器,它在 `git`命令的基础上增加了专门与Hub交互的额外函数。第二种选择,称为“基于HTTP”的方法,涉及使用[`HfApi`]客户端进行HTTP请求。让我们来看一看每种方法的优缺点。 - -## 存储库:基于历史的 Git 方法 - -最初,`huggingface_hub`主要围绕 [`Repository`] 类构建。它为常见的 `git` 命令(如 `"git add"`、`"git commit"`、`"git push"`、`"git tag"`、`"git checkout"` 等)提供了 Python 包装器 - -该库还可以帮助设置凭据和跟踪大型文件,这些文件通常在机器学习存储库中使用。此外,该库允许您在后台执行其方法,使其在训练期间上传数据很有用。 - -使用 [`Repository`] 的最大优点是它允许你在本地机器上维护整个存储库的本地副本。这也可能是一个缺点,因为它需要你不断更新和维护这个本地副本。这类似于传统软件开发中,每个开发人员都维护自己的本地副本,并在开发功能时推送更改。但是,在机器学习的上下文中,这可能并不总是必要的,因为用户可能只需要下载推理所需的权重,或将权重从一种格式转换为另一种格式,而无需克隆整个存储库。 - -## HfApi: 一个功能强大且方便的HTTP客户端 - -`HfApi` 被开发为本地 git 存储库的替代方案,因为本地 git 存储库在处理大型模型或数据集时可能会很麻烦。`HfApi` 提供与基于 git 的方法相同的功能,例如下载和推送文件以及创建分支和标签,但无需本地文件夹来保持同步。 - -`HfApi`除了提供 `git` 已经提供的功能外,还提供其他功能,例如: - -* 管理存储库 -* 使用缓存下载文件以进行有效的重复使用 -* 在 Hub 中搜索存储库和元数据 -* 访问社区功能,如讨论、PR和评论 -* 配置Spaces - -## 我应该使用什么?以及何时使用? - -总的来说,在大多数情况下,`HTTP 方法`是使用 huggingface_hub 的推荐方法。但是,在以下几种情况下,维护本地 git 克隆(使用 `Repository`)可能更有益: - -如果您在本地机器上训练模型,使用传统的 git 工作流程并定期推送更新可能更有效。`Repository` 被优化为此类情况,因为它能够在后台运行。 -如果您需要手动编辑大型文件,`git `是最佳选择,因为它只会将文件的差异发送到服务器。使用 `HfAPI` 客户端,每次编辑都会上传整个文件。请记住,大多数大型文件是二进制文件,因此无法从 git 差异中受益。 - -并非所有 git 命令都通过 [`HfApi`] 提供。有些可能永远不会被实现,但我们一直在努力改进并缩小差距。如果您没有看到您的用例被覆盖。 - -请在[Github](https://github.com/huggingface/huggingface_hub)打开一个 issue!我们欢迎反馈,以帮助我们与我们的用户一起构建 🤗 生态系统。 diff --git a/docs/source/cn/guides/repository.md b/docs/source/cn/guides/repository.md index d84d37938b..e414cfc3e9 100644 --- a/docs/source/cn/guides/repository.md +++ b/docs/source/cn/guides/repository.md @@ -156,91 +156,3 @@ GitRefs( >>> from huggingface_hub import move_repo >>> move_repo(from_id="Wauplin/cool-model", to_id="huggingface/cool-model") ``` - -## 管理存储库的本地副本 - -上述所有操作都可以通过HTTP请求完成。然而,在某些情况下,您可能希望在本地拥有存储库的副本,并使用您熟悉的Git命令与之交互。 - -[`Repository`] 类允许您使用类似于Git命令的函数与Hub上的文件和存储库进行交互。它是对Git和Git-LFS方法的包装,以使用您已经了解和喜爱的Git命令。在开始之前,请确保已安装Git-LFS(请参阅[此处](https://git-lfs.github.com/)获取安装说明)。 - -### 使用本地存储库 - -使用本地存储库路径实例化一个 [`Repository`] 对象: - -请运行以下代码: - -```py ->>> from huggingface_hub import Repository ->>> repo = Repository(local_dir="//") -``` - -### 克隆 - -`clone_from`参数将一个存储库从Hugging Face存储库ID克隆到由 `local_dir`参数指定的本地目录: - -请运行以下代码: - -```py ->>> from huggingface_hub import Repository ->>> repo = Repository(local_dir="w2v2", clone_from="facebook/wav2vec2-large-960h-lv60") -``` -`clone_from`还可以使用URL克隆存储库: - -请运行以下代码: - -```py ->>> repo = Repository(local_dir="huggingface-hub", clone_from="https://huggingface.co/facebook/wav2vec2-large-960h-lv60") -``` - -你可以将`clone_from`参数与[`create_repo`]结合使用,以创建并克隆一个存储库: - -请运行以下代码: - -```py ->>> repo_url = create_repo(repo_id="repo_name") ->>> repo = Repository(local_dir="repo_local_path", clone_from=repo_url) -``` - -当你克隆一个存储库时,通过在克隆时指定`git_user`和`git_email`参数,你还可以为克隆的存储库配置Git用户名和电子邮件。当用户提交到该存储库时,Git将知道提交的作者是谁。 - -请运行以下代码: - -```py ->>> repo = Repository( -... "my-dataset", -... clone_from="/", -... token=True, -... repo_type="dataset", -... git_user="MyName", -... git_email="me@cool.mail" -... ) -``` - -### 分支 - -分支对于协作和实验而不影响当前文件和代码非常重要。使用[`~Repository.git_checkout`]来在不同的分支之间切换。例如,如果你想从 `branch1`切换到 `branch2`: - -请运行以下代码: - -```py ->>> from huggingface_hub import Repository ->>> repo = Repository(local_dir="huggingface-hub", clone_from="/", revision='branch1') ->>> repo.git_checkout("branch2") -``` - -### 拉取 - -[`~Repository.git_pull`] 允许你使用远程存储库的更改更新当前本地分支: - -请运行以下代码: - -```py ->>> from huggingface_hub import Repository ->>> repo.git_pull() -``` - -如果你希望本地的提交发生在你的分支被远程的新提交更新之后,请设置`rebase=True`: - -```py ->>> repo.git_pull(rebase=True) -``` diff --git a/docs/source/de/_toctree.yml b/docs/source/de/_toctree.yml index 48807ba0d8..2b994c7cc6 100644 --- a/docs/source/de/_toctree.yml +++ b/docs/source/de/_toctree.yml @@ -34,7 +34,3 @@ title: Integrieren einer Bibliothek - local: guides/webhooks_server title: Webhooks server -- title: "Konzeptionelle Anleitungen" - sections: - - local: concepts/git_vs_http - title: Git vs. HTTP-Paradigma diff --git a/docs/source/de/concepts/git_vs_http.md b/docs/source/de/concepts/git_vs_http.md deleted file mode 100644 index 978123762a..0000000000 --- a/docs/source/de/concepts/git_vs_http.md +++ /dev/null @@ -1,69 +0,0 @@ - - -# Git vs. HTTP-Paradigma - -Die `huggingface_hub`-Bibliothek ist eine Bibliothek zur Interaktion mit dem Hugging Face -Hub, einer Sammlung von auf Git basierenden Repositories (Modelle, Datensätze oder -Spaces). Es gibt zwei Hauptmethoden, um auf den Hub mit `huggingface_hub` zuzugreifen. - -Der erste Ansatz, der sogenannte "Git-basierte" Ansatz, wird von der [`Repository`] Klasse -geleitet. Diese Methode verwendet einen Wrapper um den `git`-Befehl mit zusätzlichen -Funktionen, die speziell für die Interaktion mit dem Hub entwickelt wurden. Die zweite -Option, die als "HTTP-basierter" Ansatz bezeichnet wird, umfasst das Senden von -HTTP-Anfragen mit dem [`HfApi`] Client. Schauen wir uns die Vor- und Nachteile jeder -Methode an. - -## Repository: Der historische git-basierte Ansatz - -Ursprünglich wurde `huggingface_hub` größtenteils um die [`Repository`] Klasse herum -entwickelt. Sie bietet Python-Wrapper für gängige git-Befehle wie `"git add"`, `"git commit"`, -`"git push"`, `"git tag"`, `"git checkout"` usw. - -Die Bibliothek hilft auch beim Festlegen von Zugangsdaten und beim Tracking von großen -Dateien, die in Machine-Learning-Repositories häufig verwendet werden. Darüber hinaus -ermöglicht die Bibliothek das Ausführen ihrer Methoden im Hintergrund, was nützlich ist, -um Daten während des Trainings hochzuladen. - -Der Hauptvorteil bei der Verwendung einer [`Repository`] besteht darin, dass Sie eine -lokale Kopie des gesamten Repositorys auf Ihrem Computer pflegen können. Dies kann jedoch -auch ein Nachteil sein, da es erfordert, diese lokale Kopie ständig zu aktualisieren und -zu pflegen. Dies ähnelt der traditionellen Softwareentwicklung, bei der jeder Entwickler -eine eigene lokale Kopie pflegt und Änderungen überträgt, wenn an einer Funktion -gearbeitet wird. Im Kontext des Machine Learning ist dies jedoch nicht immer erforderlich, -da Benutzer möglicherweise nur Gewichte für die Inferenz herunterladen oder Gewichte von -einem Format in ein anderes konvertieren müssen, ohne das gesamte Repository zu klonen. - -## HfApi: Ein flexibler und praktischer HTTP-Client - -Die [`HfApi`] Klasse wurde entwickelt, um eine Alternative zu lokalen Git-Repositories -bereitzustellen, die besonders bei der Arbeit mit großen Modellen oder Datensätzen -umständlich zu pflegen sein können. Die [`HfApi`] Klasse bietet die gleiche Funktionalität -wie git-basierte Ansätze, wie das Herunterladen und Hochladen von Dateien sowie das -Erstellen von Branches und Tags, jedoch ohne die Notwendigkeit eines lokalen Ordners, der -synchronisiert werden muss. - -Zusätzlich zu den bereits von `git` bereitgestellten Funktionen bietet die [`HfApi`] -Klasse zusätzliche Features wie die Möglichkeit, Repositories zu verwalten, Dateien mit -Caching für effiziente Wiederverwendung herunterzuladen, im Hub nach Repositories und -Metadaten zu suchen, auf Community-Funktionen wie Diskussionen, Pull Requests und -Kommentare zuzugreifen und Spaces-Hardware und Geheimnisse zu konfigurieren. - -## Was sollte ich verwenden ? Und wann ? - -Insgesamt ist der **HTTP-basierte Ansatz in den meisten Fällen die empfohlene Methode zur Verwendung von** -`huggingface_hub`. Es gibt jedoch einige Situationen, in denen es vorteilhaft sein kann, -eine lokale Git-Kopie (mit [`Repository`]) zu pflegen: -- Wenn Sie ein Modell auf Ihrem Computer trainieren, kann es effizienter sein, einen -herkömmlichen git-basierten Workflow zu verwenden und regelmäßige Updates zu pushen. -[`Repository`] ist für diese Art von Situation mit seiner Fähigkeit zur Hintergrundarbeit optimiert. -- Wenn Sie große Dateien manuell bearbeiten müssen, ist `git` die beste Option, da es nur -die Differenz an den Server sendet. Mit dem [`HfAPI`] Client wird die gesamte Datei bei -jeder Bearbeitung hochgeladen. Beachten Sie jedoch, dass die meisten großen Dateien binär -sind und daher sowieso nicht von Git-Diffs profitieren. - -Nicht alle Git-Befehle sind über [`HfApi`] verfügbar. Einige werden vielleicht nie -implementiert, aber wir bemühen uns ständig, die Lücken zu schließen und zu verbessern. -Wenn Sie Ihren Anwendungsfall nicht abgedeckt sehen, öffnen Sie bitte [ein Issue auf -Github](https://github.com/huggingface/huggingface_hub)! Wir freuen uns über Feedback, um das 🤗-Ökosystem mit und für unsere Benutzer aufzubauen. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 4c03a41c7b..3f930fb448 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -54,8 +54,6 @@ title: Authentication - local: package_reference/environment_variables title: Environment variables - - local: package_reference/repository - title: Managing local and online repositories - local: package_reference/hf_api title: Hugging Face Hub API - local: package_reference/file_download diff --git a/docs/source/en/guides/repository.md b/docs/source/en/guides/repository.md index 9943514ddf..6a9ce1f9ff 100644 --- a/docs/source/en/guides/repository.md +++ b/docs/source/en/guides/repository.md @@ -178,85 +178,3 @@ that you should be aware of. For example, you can't transfer your repo to anothe >>> from huggingface_hub import move_repo >>> move_repo(from_id="Wauplin/cool-model", to_id="huggingface/cool-model") ``` - -## Manage a local copy of your repository - -All the actions described above can be done using HTTP requests. However, in some cases you might be interested in having -a local copy of your repository and interact with it using the Git commands you are familiar with. - -The [`Repository`] class allows you to interact with files and repositories on the Hub with functions similar to Git commands. It is a wrapper over Git and Git-LFS methods to use the Git commands you already know and love. Before starting, please make sure you have Git-LFS installed (see [here](https://git-lfs.github.com/) for installation instructions). - - - -[`Repository`] is deprecated in favor of the http-based alternatives implemented in [`HfApi`]. Given its large adoption in legacy code, the complete removal of [`Repository`] will only happen in release `v1.0`. For more details, please read [this explanation page](./concepts/git_vs_http). - - - -### Use a local repository - -Instantiate a [`Repository`] object with a path to a local repository: - -```py ->>> from huggingface_hub import Repository ->>> repo = Repository(local_dir="//") -``` - -### Clone - -The `clone_from` parameter clones a repository from a Hugging Face repository ID to a local directory specified by the `local_dir` argument: - -```py ->>> from huggingface_hub import Repository ->>> repo = Repository(local_dir="w2v2", clone_from="facebook/wav2vec2-large-960h-lv60") -``` - -`clone_from` can also clone a repository using a URL: - -```py ->>> repo = Repository(local_dir="huggingface-hub", clone_from="https://huggingface.co/facebook/wav2vec2-large-960h-lv60") -``` - -You can combine the `clone_from` parameter with [`create_repo`] to create and clone a repository: - -```py ->>> repo_url = create_repo(repo_id="repo_name") ->>> repo = Repository(local_dir="repo_local_path", clone_from=repo_url) -``` - -You can also configure a Git username and email to a cloned repository by specifying the `git_user` and `git_email` parameters when you clone a repository. When users commit to that repository, Git will be aware of the commit author. - -```py ->>> repo = Repository( -... "my-dataset", -... clone_from="/", -... token=True, -... repo_type="dataset", -... git_user="MyName", -... git_email="me@cool.mail" -... ) -``` - -### Branch - -Branches are important for collaboration and experimentation without impacting your current files and code. Switch between branches with [`~Repository.git_checkout`]. For example, if you want to switch from `branch1` to `branch2`: - -```py ->>> from huggingface_hub import Repository ->>> repo = Repository(local_dir="huggingface-hub", clone_from="/", revision='branch1') ->>> repo.git_checkout("branch2") -``` - -### Pull - -[`~Repository.git_pull`] allows you to update a current local branch with changes from a remote repository: - -```py ->>> from huggingface_hub import Repository ->>> repo.git_pull() -``` - -Set `rebase=True` if you want your local commits to occur after your branch is updated with the new commits from the remote: - -```py ->>> repo.git_pull(rebase=True) -``` diff --git a/docs/source/en/guides/upload.md b/docs/source/en/guides/upload.md index 69930887eb..ff91031eba 100644 --- a/docs/source/en/guides/upload.md +++ b/docs/source/en/guides/upload.md @@ -4,12 +4,7 @@ rendered properly in your Markdown viewer. # Upload files to the Hub -Sharing your files and work is an important aspect of the Hub. The `huggingface_hub` offers several options for uploading your files to the Hub. You can use these functions independently or integrate them into your library, making it more convenient for your users to interact with the Hub. This guide will show you how to push files: - -- without using Git. -- that are very large with [Git LFS](https://git-lfs.github.com/). -- with the `commit` context manager. -- with the [`~Repository.push_to_hub`] function. +Sharing your files and work is an important aspect of the Hub. The `huggingface_hub` offers several options for uploading your files to the Hub. You can use these functions independently or integrate them into your library, making it more convenient for your users to interact with the Hub. Whenever you want to upload files to the Hub, you need to log in to your Hugging Face account. For more details about authentication, check out [this section](../quick-start#authentication). @@ -486,114 +481,3 @@ update of the object is that **the binary content is removed** from it, meaning you don't store another reference to it. This is expected as we don't want to keep in memory the content that is already uploaded. Finally we create the commit by passing all the operations to [`create_commit`]. You can pass additional operations (add, delete or copy) that have not been processed yet and they will be handled correctly. - -## (legacy) Upload files with Git LFS - -All the methods described above use the Hub's API to upload files. This is the recommended way to upload files to the Hub. -However, we also provide [`Repository`], a wrapper around the git tool to manage a local repository. - - - -Although [`Repository`] is not formally deprecated, we recommend using the HTTP-based methods described above instead. -For more details about this recommendation, please have a look at [this guide](../concepts/git_vs_http) explaining the -core differences between HTTP-based and Git-based approaches. - - - -Git LFS automatically handles files larger than 10MB. But for very large files (>5GB), you need to install a custom transfer agent for Git LFS: - -```bash -hf lfs-enable-largefiles -``` - -You should install this for each repository that has a very large file. Once installed, you'll be able to push files larger than 5GB. - -### commit context manager - -The `commit` context manager handles four of the most common Git commands: pull, add, commit, and push. `git-lfs` automatically tracks any file larger than 10MB. In the following example, the `commit` context manager: - -1. Pulls from the `text-files` repository. -2. Adds a change made to `file.txt`. -3. Commits the change. -4. Pushes the change to the `text-files` repository. - -```python ->>> from huggingface_hub import Repository ->>> with Repository(local_dir="text-files", clone_from="/text-files").commit(commit_message="My first file :)"): -... with open("file.txt", "w+") as f: -... f.write(json.dumps({"hey": 8})) -``` - -Here is another example of how to use the `commit` context manager to save and upload a file to a repository: - -```python ->>> import torch ->>> model = torch.nn.Transformer() ->>> with Repository("torch-model", clone_from="/torch-model", token=True).commit(commit_message="My cool model :)"): -... torch.save(model.state_dict(), "model.pt") -``` - -Set `blocking=False` if you would like to push your commits asynchronously. Non-blocking behavior is helpful when you want to continue running your script while your commits are being pushed. - -```python ->>> with repo.commit(commit_message="My cool model :)", blocking=False) -``` - -You can check the status of your push with the `command_queue` method: - -```python ->>> last_command = repo.command_queue[-1] ->>> last_command.status -``` - -Refer to the table below for the possible statuses: - -| Status | Description | -| -------- | ------------------------------------ | -| -1 | The push is ongoing. | -| 0 | The push has completed successfully. | -| Non-zero | An error has occurred. | - -When `blocking=False`, commands are tracked, and your script will only exit when all pushes are completed, even if other errors occur in your script. Some additional useful commands for checking the status of a push include: - -```python -# Inspect an error. ->>> last_command.stderr - -# Check whether a push is completed or ongoing. ->>> last_command.is_done - -# Check whether a push command has errored. ->>> last_command.failed -``` - -### push_to_hub - -The [`Repository`] class has a [`~Repository.push_to_hub`] function to add files, make a commit, and push them to a repository. Unlike the `commit` context manager, you'll need to pull from a repository first before calling [`~Repository.push_to_hub`]. - -For example, if you've already cloned a repository from the Hub, then you can initialize the `repo` from the local directory: - -```python ->>> from huggingface_hub import Repository ->>> repo = Repository(local_dir="path/to/local/repo") -``` - -Update your local clone with [`~Repository.git_pull`] and then push your file to the Hub: - -```py ->>> repo.git_pull() ->>> repo.push_to_hub(commit_message="Commit my-awesome-file to the Hub") -``` - -However, if you aren't ready to push a file yet, you can use [`~Repository.git_add`] and [`~Repository.git_commit`] to only add and commit your file: - -```py ->>> repo.git_add("path/to/file") ->>> repo.git_commit(commit_message="add my first model config file :)") -``` - -When you're ready, push the file to your repository with [`~Repository.git_push`]: - -```py ->>> repo.git_push() -``` diff --git a/docs/source/en/package_reference/repository.md b/docs/source/en/package_reference/repository.md deleted file mode 100644 index de7851d6a9..0000000000 --- a/docs/source/en/package_reference/repository.md +++ /dev/null @@ -1,51 +0,0 @@ - - -# Managing local and online repositories - -The `Repository` class is a helper class that wraps `git` and `git-lfs` commands. It provides tooling adapted -for managing repositories which can be very large. - -It is the recommended tool as soon as any `git` operation is involved, or when collaboration will be a point -of focus with the repository itself. - -## The Repository class - -[[autodoc]] Repository - - __init__ - - current_branch - - all - -## Helper methods - -[[autodoc]] huggingface_hub.repository.is_git_repo - -[[autodoc]] huggingface_hub.repository.is_local_clone - -[[autodoc]] huggingface_hub.repository.is_tracked_with_lfs - -[[autodoc]] huggingface_hub.repository.is_git_ignored - -[[autodoc]] huggingface_hub.repository.files_to_be_staged - -[[autodoc]] huggingface_hub.repository.is_tracked_upstream - -[[autodoc]] huggingface_hub.repository.commits_to_push - -## Following asynchronous commands - -The `Repository` utility offers several methods which can be launched asynchronously: -- `git_push` -- `git_pull` -- `push_to_hub` -- The `commit` context manager - -See below for utilities to manage such asynchronous methods. - -[[autodoc]] Repository - - commands_failed - - commands_in_progress - - wait_for_commands - -[[autodoc]] huggingface_hub.repository.CommandInProgress diff --git a/docs/source/fr/_toctree.yml b/docs/source/fr/_toctree.yml index f6c76ff6f5..d9ed776e0a 100644 --- a/docs/source/fr/_toctree.yml +++ b/docs/source/fr/_toctree.yml @@ -6,10 +6,6 @@ title: Démarrage rapide - local: installation title: Installation -- title: "Concepts" - sections: - - local: concepts/git_vs_http - title: Git ou HTTP? - title: "Guides" sections: - local: guides/integrations diff --git a/docs/source/fr/concepts/git_vs_http.md b/docs/source/fr/concepts/git_vs_http.md deleted file mode 100644 index 8ccc31b69c..0000000000 --- a/docs/source/fr/concepts/git_vs_http.md +++ /dev/null @@ -1,67 +0,0 @@ - - -# Git ou HTTP? - -`huggingface_hub` est une librairie qui permet d'interagir avec le Hugging Face Hub, -qui est une collection de dépots Git (modèles, datasets ou spaces). -Il y a deux manières principales pour accéder au Hub en utilisant `huggingface_hub`. - -La première approche, basée sur Git, appelée approche "git-based", est rendue possible par la classe [`Repository`]. -Cette méthode utilise un wrapper autour de la commande `git` avec des fonctionnalités supplémentaires conçues pour interagir avec le Hub. La deuxième option, appelée approche "HTTP-based" , consiste à faire des requêtes HTTP en utilisant le client [`HfApi`]. Examinons -les avantages et les inconvénients de ces deux méthodes. - -## Repository: l'approche historique basée sur git - -Initialement, `huggingface_hub` était principalement construite autour de la classe [`Repository`]. Elle fournit des -wrappers Python pour les commandes `git` usuelles, telles que `"git add"`, `"git commit"`, `"git push"`, -`"git tag"`, `"git checkout"`, etc. - -Cette librairie permet aussi de gérer l'authentification et les fichiers volumineux, souvent présents dans les dépôts Git de machine learning. De plus, ses méthodes sont exécutables en arrière-plan, ce qui est utile pour upload des données durant l'entrainement d'un modèle. - -L'avantage principal de l'approche [`Repository`] est qu'elle permet de garder une -copie en local du dépot Git sur votre machine. Cela peut aussi devenir un désavantage, -car cette copie locale doit être mise à jour et maintenue constamment. C'est une méthode -analogue au développement de logiciel classique où chaque développeur maintient sa propre copie locale -et push ses changements lorsqu'il travaille sur une nouvelle fonctionnalité. -Toutefois, dans le contexte du machine learning la taille des fichiers rend peu pertinente cette approche car -les utilisateurs ont parfois besoin d'avoir -uniquement les poids des modèles pour l'inférence ou de convertir ces poids d'un format à un autre sans avoir à cloner -tout le dépôt. - - - -[`Repository`] est maintenant obsolète et remplacée par les alternatives basées sur des requêtes HTTP. Étant donné son adoption massive par les utilisateurs, -la suppression complète de [`Repository`] ne sera faite que pour la version `v1.0`. - - - -## HfApi: Un client HTTP plus flexible - -La classe [`HfApi`] a été développée afin de fournir une alternative aux dépôts git locaux, -qui peuvent être encombrant à maintenir, en particulier pour des modèles ou datasets volumineux. -La classe [`HfApi`] offre les mêmes fonctionnalités que les approches basées sur Git, -telles que le téléchargement et le push de fichiers ainsi que la création de branches et de tags, mais sans -avoir besoin d'un fichier local qui doit être constamment synchronisé. - -En plus des fonctionnalités déjà fournies par `git`, La classe [`HfApi`] offre des fonctionnalités -additionnelles, telles que la capacité à gérer des dépôts, le téléchargement des fichiers -dans le cache (permettant une réutilisation), la recherche dans le Hub pour trouver -des dépôts et des métadonnées, l'accès aux fonctionnalités communautaires telles que, les discussions, -les pull requests et les commentaires. - -## Quelle méthode utiliser et quand ? - -En général, **l'approche HTTP est la méthode recommandée** pour utiliser `huggingface_hub` -[`HfApi`] permet de pull et push des changements, de travailler avec les pull requests, les tags et les branches, l'interaction avec les discussions -et bien plus encore. Depuis la version `0.16`, les méthodes HTTP-based peuvent aussi être exécutées en arrière-plan, ce qui constituait le -dernier gros avantage de la classe [`Repository`]. - -Toutefois, certaines commandes restent indisponibles en utilisant [`HfApi`]. -Peut être que certaines ne le seront jamais, mais nous essayons toujours de réduire le fossé entre ces deux approches. -Si votre cas d'usage n'est pas couvert, nous serions ravis de vous aider. Pour cela, ouvrez -[une issue sur Github](https://github.com/huggingface/huggingface_hub)! Nous écoutons tous les retours nous permettant de construire -l'écosystème 🤗 avec les utilisateurs et pour les utilisateurs. - -Cette préférence pour l'approche basée sur [`HfApi`] plutôt que [`Repository`] ne signifie pas que les dépôts stopperons d'être versionnés avec git sur le Hugging Face Hub. Il sera toujours possible d'utiliser les commandes `git` en local lorsque nécessaire. \ No newline at end of file diff --git a/docs/source/hi/_toctree.yml b/docs/source/hi/_toctree.yml index 5b9e412c50..f8b3606536 100644 --- a/docs/source/hi/_toctree.yml +++ b/docs/source/hi/_toctree.yml @@ -6,7 +6,3 @@ title: जल्दी शुरू - local: installation title: इंस्टालेशन -- title: "संकल्पना मार्गदर्शिकाएँ" - sections: - - local: concepts/git_vs_http - title: "संकल्पनाएँ/गिट_बनाम_एचटीटीपी" diff --git a/docs/source/hi/concepts/git_vs_http.md b/docs/source/hi/concepts/git_vs_http.md deleted file mode 100644 index ebb3574352..0000000000 --- a/docs/source/hi/concepts/git_vs_http.md +++ /dev/null @@ -1,33 +0,0 @@ -# Git vs HTTP पैराडाइम - -`huggingface_hub` लाइब्रेरी Hugging Face Hub के साथ आदान-प्रदान करने के लिए एक लाइब्रेरी है, जो git-आधारित repositories (models, datasets या Spaces) का एक संग्रह है। `huggingface_hub` का उपयोग करके Hub तक पहुंचने के दो मुख्य तरीके हैं। - -पहला तरीका, जिसे "git-आधारित" तरीका कहा जाता है, [`Repository`] क्लास द्वारा संचालित है। यह विधि `git` कमांड के चारों ओर एक आवरण का उपयोग करती है जिसमें Hub के साथ आदान-प्रदान करने के लिए विशेष रूप से डिज़ाइन किए गए अतिरिक्त functions हैं। दूसरा विकल्प, जिसे "HTTP-आधारित" तरीका कहा जाता है, [`HfApi`] client का उपयोग करके HTTP requests बनाने में शामिल है। आइए प्रत्येक तरीका के फायदे और नुकसान की जांच करते हैं। - -## Repository: ऐतिहासिक git-आधारित तरीका - -शुरुआत में, `huggingface_hub` मुख्य रूप से [`Repository`] क्लास के चारों ओर बनाया गया था। यह सामान्य `git` कमांड जैसे `"git add"`, `"git commit"`, `"git push"`, `"git tag"`, `"git checkout"`, आदि के लिए Python wrappers प्रदान करता है। - -लाइब्रेरी विवरण सेट करने और बड़ी फाइलों को track करने में भी मदद करती है, जो अक्सर machine learning repositories में उपयोग की जाती हैं। इसके अतिरिक्त, लाइब्रेरी आपको अपनी विधियों को पृष्ठभूमि में कार्यान्वित करने की अनुमति देती है, जो training के दौरान डेटा अपलोड करने के लिए उपयोगी है। - -[`Repository`] का उपयोग करने का मुख्य फायदा यह है कि यह आपको अपनी मशीन पर संपूर्ण repository की एक local copy बनाए रखने की अनुमति देता है। यह एक नुकसान भी हो सकता है क्योंकि इसके लिए आपको इस local copy को लगातार update और maintain करना होता है। यह पारंपरिक software development के समान है जहां प्रत्येक developer अपनी स्वयं की local copy maintain करता है और feature पर काम करते समय changes push करता है। हालांकि, machine learning के संदर्भ में, यह हमेशा आवश्यक नहीं हो सकता क्योंकि users को केवल inference के लिए weights download करने या weights को एक format से दूसरे में convert करने की आवश्यकता हो सकती है, बिना पूरी repository को clone करने की आवश्यकता के। - - - -[`Repository`] अब http-आधारित विकल्पों के पक्ष में deprecated है। legacy code में इसकी बड़ी अपनाई जाने के कारण, [`Repository`] का पूर्ण removal केवल `v1.0` release में होगा। - - - -## HfApi: एक लचीला और सुविधाजनक HTTP client - -[`HfApi`] क्लास को local git repositories का एक विकल्प प्रदान करने के लिए विकसित किया गया था, जो maintain करना मुश्किल हो सकता है, विशेष रूप से बड़े models या datasets के साथ व्यवहार करते समय। [`HfApi`] क्लास git-आधारित तरीकाों की समान functionality प्रदान करती है, जैसे files download और push करना और branches तथा tags बनाना, लेकिन एक local folder की आवश्यकता के बिना जिसे sync में रखना पड़ता है। - -`git` द्वारा पहले से प्रदान की गई functionalities के अलावा, [`HfApi`] क्लास अतिरिक्त features प्रदान करती है, जैसे repos manage करने की क्षमता, efficient reuse के लिए caching का उपयोग करके files download करना, repos और metadata के लिए Hub को search करना, discussions, PRs, और comments जैसी community features तक पहुंच, और Spaces hardware और secrets को configure करना। - -## मुझे क्या उपयोग करना चाहिए? और कब? - -कुल मिलाकर, **HTTP-आधारित तरीका सभी cases में** `huggingface_hub` का उपयोग करने का **अनुशंसित तरीका है**। [`HfApi`] changes को pull और push करने, PRs, tags और branches के साथ काम करने, discussions के साथ interact करने और बहुत कुछ करने की अनुमति देता है। `0.16` release के बाद से, http-आधारित methods भी पृष्ठभूमि में चल सकती हैं, जो [`Repository`] क्लास का अंतिम प्रमुख फायदा था। - -हालांकि, सभी git commands [`HfApi`] के माध्यम से उपलब्ध नहीं हैं। कुछ को कभी भी implement नहीं किया जा सकता है, लेकिन हम हमेशा सुधार करने और gap को बंद करने की कोशिश कर रहे हैं। यदि आपको अपना use case covered नहीं दिखता है, तो कृपया [Github पर एक issue खोलें](https://github.com/huggingface/huggingface_hub)! हम अपने users के साथ और उनके लिए 🤗 ecosystem बनाने में मदद करने के लिए feedback का स्वागत करते हैं। - -git-आधारित [`Repository`] पर http-आधारित [`HfApi`] की यह प्राथमिकता का मतलब यह नहीं है कि git versioning Hugging Face Hub से जल्द ही गायब हो जाएगी। workflows में जहां यह समझ में आता है, वहां `git` commands का locally उपयोग करना हमेशा संभव होगा। \ No newline at end of file diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 2c7a4da702..0a82cd72db 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -40,10 +40,6 @@ title: 라이브러리 통합 - local: guides/webhooks_server title: 웹훅 서버 -- title: "개념 가이드" - sections: - - local: concepts/git_vs_http - title: Git 대 HTTP 패러다임 - title: "라이브러리 레퍼런스" sections: - local: package_reference/overview @@ -52,8 +48,6 @@ title: 로그인 및 로그아웃 - local: package_reference/environment_variables title: 환경 변수 - - local: package_reference/repository - title: 로컬 및 온라인 리포지토리 관리 - local: package_reference/hf_api title: 허깅페이스 Hub API - local: package_reference/file_download diff --git a/docs/source/ko/concepts/git_vs_http.md b/docs/source/ko/concepts/git_vs_http.md deleted file mode 100644 index 7f2bd9933f..0000000000 --- a/docs/source/ko/concepts/git_vs_http.md +++ /dev/null @@ -1,53 +0,0 @@ - - -# Git 대 HTTP 패러다임 - -`huggingface_hub` 라이브러리는 git 기반의 저장소(Models, Datasets 또는 Spaces)로 구성된 Hugging Face Hub과 상호 작용하기 위한 라이브러리입니다. -`huggingface_hub`를 사용하여 Hub에 접근하는 방법은 크게 두 가지입니다. - -첫 번째 접근 방식인 소위 "git 기반" 접근 방식은 [`Repository`] 클래스가 주도합니다. -이 방법은 허브와 상호 작용하도록 특별히 설계된 추가 기능이 있는 `git` 명령에 랩퍼를 사용합니다. -두 번째 방법은 "HTTP 기반" 접근 방식이며, [`HfApi`] 클라이언트를 사용하여 HTTP 요청을 수행합니다. -각 방법의 장단점을 살펴보겠습니다. - -## Repository: 역사적인 Git 기반 접근 방식 - -먼저, `huggingface_hub`는 주로 [`Repository`] 클래스를 기반으로 구축되었습니다. -이 클래스는 `"git add"`, `"git commit"`, `"git push"`, `"git tag"`, `"git checkout"` 등과 같은 일반적인 `git` 명령에 대한 Python 랩퍼를 제공합니다. - -이 라이브러리는 머신러닝 저장소에서 자주 사용되는 큰 파일을 추적하고 자격 증명을 설정하는 데 도움이 됩니다. -또한, 이 라이브러리는 백그라운드에서 메소드를 실행할 수 있어, 훈련 중에 데이터를 업로드할 때 유용합니다. - -로컬 머신에 전체 저장소의 로컬 복사본을 유지할 수 있다는 것은 [`Repository`]를 사용하는 가장 큰 장점입니다. -하지만 동시에 로컬 복사본을 지속적으로 업데이트하고 유지해야 한다는 단점이 될 수도 있습니다. -이는 각 개발자가 자체 로컬 복사본을 유지하고 기능을 개발할 때 변경 사항을 push하는 전통적인 소프트웨어 개발과 유사합니다. -그러나 머신러닝의 경우, 사용자가 전체 저장소를 복제할 필요 없이 추론을 위해 가중치만 다운로드하거나 가중치를 한 형식에서 다른 형식으로 변환하기만 하면 되기 때문에 이런 방식이 항상 필요한 것은 아닙니다. - - - -[`Repository`]는 지원이 중단될 예정이므로 HTTP 기반 대안을 사용하는 것을 권장합니다. 기존 코드에서 널리 사용되기 때문에 [`Repository`]의 완전한 제거는 릴리스 `v1.0`에서 이루어질 예정입니다. - - - -## HfApi: 유연하고 편리한 HTTP 클라이언트 - -[`HfApi`] 클래스는 특히 큰 모델이나 데이터셋을 처리할 때 유지하기 어려운 로컬 git 저장소의 대안으로 개발되었습니다. -[`HfApi`] 클래스는 파일 다운로드 및 push, 브랜치 및 태그 생성과 같은 git 기반 접근 방식과 동일한 기능을 제공하지만, 동기화 상태를 유지해야 하는 로컬 폴더가 필요하지 않습니다. - -[`HfApi`] 클래스는 `git`이 제공하는 기능 외에도 추가적인 기능을 제공합니다. -저장소를 관리하고, 효율적인 재사용을 위해 캐싱을 사용하여 파일을 다운로드하고, Hub에서 저장소 및 메타데이터를 검색하고, 토론, PR 및 코멘트와 같은 커뮤니티 기능에 접근하고, Spaces 하드웨어 및 시크릿을 구성할 수 있습니다. - -## 무엇을 사용해야 하나요? 언제 사용하나요? - -전반적으로, **HTTP 기반 접근 방식은 모든 경우에** `huggingface_hub`를 사용하는 것이 좋습니다. -[`HfApi`]를 사용하면 변경 사항을 pull하고 push하고, PR, 태그 및 브랜치로 작업하고, 토론과 상호 작용하는 등의 작업을 할 수 있습니다. -`0.16` 릴리스부터는 [`Repository`] 클래스의 마지막 주요 장점이었던 http 기반 메소드도 백그라운드에서 실행할 수 있습니다. - -그러나 모든 git 명령이 [`HfApi`]를 통해 사용 가능한 것은 아닙니다. 일부는 구현되지 않을 수도 있지만, 저희는 항상 개선하고 격차를 줄이기 위해 노력하고 있습니다. -사용 사례에 해당되지 않는 경우, [Github에서 이슈](https://github.com/huggingface/huggingface_hub)를 개설해 주세요! -사용자와 함께, 사용자를 위한 🤗 생태계를 구축하는 데 도움이 되는 피드백을 환영합니다. - -git 기반 [`Repository`]보다 http 기반 [`HfApi`]를 선호한다고 해서 Hugging Face Hub에서 git 버전 관리가 바로 사라지는 것은 아닙니다. -워크플로우 상 합당하다면 언제든 로컬에서 `git` 명령을 사용할 수 있습니다. diff --git a/docs/source/ko/guides/repository.md b/docs/source/ko/guides/repository.md deleted file mode 100644 index 7544608d1c..0000000000 --- a/docs/source/ko/guides/repository.md +++ /dev/null @@ -1,223 +0,0 @@ - - -# 리포지토리 생성과 관리[[create-and-manage-a-repository]] - -Hugging Face Hub는 Git 리포지토리 모음입니다. [Git](https://git-scm.com/)은 협업을 할 때 여러 프로젝트 버전을 쉽게 관리하기 위해 널리 사용되는 소프트웨어 개발 도구입니다. 이 가이드에서는 Hub의 리포지토리 사용법인 다음 내용을 다룹니다: - -- 리포지토리 생성과 삭제. -- 태그 및 브랜치 관리. -- 리포지토리 이름 변경. -- 리포지토리 공개 여부. -- 리포지토리 복사본 관리. - - - -GitLab/GitHub/Bitbucket과 같은 플랫폼을 사용해 본 경험이 있다면, 모델 리포지토리를 관리하기 위해 `git` CLI를 사용해 git 리포지토리를 클론(`git clone`)하고 변경 사항을 커밋(`git add, git commit`)하고 커밋한 내용을 푸시(`git push`) 하는것이 가장 먼저 떠오를 것입니다. 이 명령어들은 Hugging Face Hub에서도 사용할 수 있습니다. 하지만 소프트웨어 엔지니어링과 머신러닝은 동일한 요구 사항과 워크플로우를 공유하지 않습니다. 모델 리포지토리는 다양한 프레임워크와 도구를 위한 대규모 모델 가중치 파일을 유지관리 할 수 있으므로, 리포지토리를 복제하면 대규모 로컬 폴더를 유지관리하고 막대한 크기의 파일을 다루게 될 수 있습니다. 결과적으로 Hugging Face의 커스텀 HTTP 방법을 사용하는 것이 더욱 효율적일 수 있습니다. 더 자세한 내용은 [Git vs HTTP paradigm](../concepts/git_vs_http) 문서를 참조하세요. - - - -Hub에 리포지토리를 생성하고 관리하려면, 로그인이 되어 있어야 합니다. 로그인이 안 되어있다면 [이 문서](../quick-start#authentication)를 참고해 주세요. 이 가이드에서는 로그인이 되어있다는 가정하에 진행됩니다. - -## 리포지토리 생성 및 삭제[[repo-creation-and-deletion]] - -첫 번째 단계는 어떻게 리포지토리를 생성하고 삭제하는지를 알아야 합니다. 사용자 이름 네임스페이스 아래에 소유한 리포지토리 또는 쓰기 권한이 있는 조직의 리포지토리만 관리할 수 있습니다. - -### 리포지토리 생성[[create-a-repository]] - -[`create_repo`] 함수로 함께 빈 리포지토리를 만들고 `repo_id` 매개변수를 사용하여 이름을 정하세요. `repo_id`는 사용자 이름 또는 조직 이름 뒤에 리포지토리 이름이 따라옵니다: `username_or_org/repo_name`. - -```py ->>> from huggingface_hub import create_repo ->>> create_repo("lysandre/test-model") -'https://huggingface.co/lysandre/test-model' -``` - -기본적으로 [`create_repo`]는 모델 리포지토리를 만듭니다. 하지만 `repo_type` 매개변수를 사용하여 다른 유형의 리포지토리를 지정할 수 있습니다. 예를 들어 데이터셋 리포지토리를 만들고 싶다면: - -```py ->>> from huggingface_hub import create_repo ->>> create_repo("lysandre/test-dataset", repo_type="dataset") -'https://huggingface.co/datasets/lysandre/test-dataset' -``` - -리포지토리를 만들 때, `private` 매개변수를 사용하여 가시성을 설정할 수 있습니다. - -```py ->>> from huggingface_hub import create_repo ->>> create_repo("lysandre/test-private", private=True) -``` - -추후 리포지토리 가시성을 변경하고 싶다면, [`update_repo_settings`] 함수를 이용해 바꿀 수 있습니다. - -### 리포지토리 삭제[[delete-a-repository]] - -[`delete_repo`]를 사용하여 리포지토리를 삭제할 수 있습니다. 리포지토리를 삭제하기 전에 신중히 결정하세요. 왜냐하면, 삭제하고 나서 다시 되돌릴 수 없는 프로세스이기 때문입니다! - -삭제하려는 리포지토리의 `repo_id`를 지정하세요: - -```py ->>> delete_repo(repo_id="lysandre/my-corrupted-dataset", repo_type="dataset") -``` - -### 리포지토리 복제(Spaces 전용)[[duplicate-a-repository-only-for-spaces]] - -가끔 다른 누군가의 리포지토리를 복사하여, 상황에 맞게 수정하고 싶을 때가 있습니다. 이는 [`duplicate_space`]를 사용하여 Space에 복사할 수 있습니다. 이 함수를 사용하면 리포지토리 전체를 복제할 수 있습니다. 그러나 여전히 하드웨어, 절전 시간, 리포지토리, 변수 및 비밀번호와 같은 자체 설정을 구성해야 합니다. 자세한 내용은 [Manage your Space](./manage-spaces) 문서를 참조하십시오. - -```py ->>> from huggingface_hub import duplicate_space ->>> duplicate_space("multimodalart/dreambooth-training", private=False) -RepoUrl('https://huggingface.co/spaces/nateraw/dreambooth-training',...) -``` - -## 파일 다운로드와 업로드[[upload-and-download-files]] - -이제 리포지토리를 생성했으므로, 변경 사항을 푸시하고 파일을 다운로드하는 것에 관심이 있을 것입니다. - -이 두 가지 주제는 각각 자체 가이드가 필요합니다. 리포지토리 사용하는 방법에 대해 알아보려면 [업로드](./upload) 및 [다운로드](./download) 문서를 참조하세요. - -## 브랜치와 태그[[branches-and-tags]] - -Git 리포지토리는 동일한 리포지토리의 다른 버전을 저장하기 위해 브랜치들을 사용합니다. 태그는 버전을 출시할 때와 같이 리포지토리의 특정 상태를 표시하는 데 사용될 수도 있습니다. 일반적으로 브랜치와 태그는 [git 참조](https://git-scm.com/book/en/v2/Git-Internals-Git-References) -로 참조됩니다. - -### 브랜치 생성과 태그[[create-branches-and-tags]] - -[`create_branch`]와 [`create_tag`]를 이용하여 새로운 브랜치와 태그를 생성할 수 있습니다. - -```py ->>> from huggingface_hub import create_branch, create_tag - -# `main` 브랜치를 기반으로 Space 저장소에 새 브랜치를 생성합니다. ->>> create_branch("Matthijs/speecht5-tts-demo", repo_type="space", branch="handle-dog-speaker") - -# `v0.1-release` 브랜치를 기반으로 Dataset 저장소에 태그를 생성합니다. ->>> create_tag("bigcode/the-stack", repo_type="dataset", revision="v0.1-release", tag="v0.1.1", tag_message="Bump release version.") -``` - -같은 방식으로 [`delete_branch`]와 [`delete_tag`] 함수를 사용하여 브랜치 또는 태그를 삭제할 수 있습니다. - -### 모든 브랜치와 태그 나열[[list-all-branches-and-tags]] - -[`list_repo_refs`]를 사용하여 리포지토리로부터 현재 존재하는 git 참조를 나열할 수 있습니다: - -```py ->>> from huggingface_hub import list_repo_refs ->>> list_repo_refs("bigcode/the-stack", repo_type="dataset") -GitRefs( - branches=[ - GitRefInfo(name='main', ref='refs/heads/main', target_commit='18edc1591d9ce72aa82f56c4431b3c969b210ae3'), - GitRefInfo(name='v1.1.a1', ref='refs/heads/v1.1.a1', target_commit='f9826b862d1567f3822d3d25649b0d6d22ace714') - ], - converts=[], - tags=[ - GitRefInfo(name='v1.0', ref='refs/tags/v1.0', target_commit='c37a8cd1e382064d8aced5e05543c5f7753834da') - ] -) -``` - -## 리포지토리 설정 변경[[change-repository-settings]] - -리포지토리는 구성할 수 있는 몇 가지 설정이 있습니다. 대부분의 경우 브라우저의 리포지토리 설정 페이지에서 직접 설정할 것입니다. 설정을 바꾸려면 리포지토리에 대한 쓰기 액세스 권한이 있어야 합니다(사용자 리포지토리거나, 조직의 구성원이어야 함). 이 주제에서는 `huggingface_hub`를 사용하여 프로그래밍 방식으로 구성할 수 있는 설정을 알아보겠습니다. - -Spaces를 위한 특정 설정들(하드웨어, 환경변수 등)을 구성하기 위해서는 [Manage your Spaces](../guides/manage-spaces) 문서를 참조하세요. - -### 가시성 업데이트[[update-visibility]] - -리포지토리는 공개 또는 비공개로 설정할 수 있습니다. 비공개 리포지토리는 해당 저장소의 사용자 혹은 소속된 조직의 구성원만 볼 수 있습니다. 다음과 같이 리포지토리를 비공개로 변경할 수 있습니다. - -```py ->>> from huggingface_hub import update_repo_settings ->>> update_repo_settings(repo_id=repo_id, private=True) -``` - -### 리포지토리 이름 변경[[rename-your-repository]] - -[`move_repo`]를 사용하여 Hub에 있는 리포지토리 이름을 변경할 수 있습니다. 이 함수를 사용하여 개인에서 조직 리포지토리로 이동할 수도 있습니다. 이렇게 하면 [일부 제한 사항](https://hf.co/docs/hub/repositories-settings#renaming-or-transferring-a-repo)이 있으므로 주의해야 합니다. 예를 들어, 다른 사용자에게 리포지토리를 이전할 수는 없습니다. - -```py ->>> from huggingface_hub import move_repo ->>> move_repo(from_id="Wauplin/cool-model", to_id="huggingface/cool-model") -``` - -## 리포지토리의 로컬 복사본 관리[[manage-a-local-copy-of-your-repository]] - -위에 설명한 모든 작업은 HTTP 요청을 사용하여 작업할 수 있습니다. 그러나 경우에 따라 로컬 복사본을 가지고 익숙한 Git 명령어를 사용하여 상호 작용하는 것이 편리할 수 있습니다. - -[`Repository`] 클래스는 Git 명령어와 유사한 기능을 제공하는 함수를 사용하여 Hub의 파일 및 리포지토리와 상호 작용할 수 있습니다. 이는 이미 알고 있고 좋아하는 Git 및 Git-LFS 방법을 사용하는 래퍼(wrapper)입니다. 시작하기 전에 Git-LFS가 설치되어 있는지 확인하세요([여기서](https://git-lfs.github.com/) 설치 지침을 확인할 수 있습니다). - - - -[`Repository`]는 [`HfApi`]에 구현된 HTTP 기반 대안을 선호하여 중단되었습니다. 아직 많은 레거시 코드에서 사용되고 있기 때문에 [`Repository`]가 완전히 제거되는 건 `v1.0` 릴리스에서만 이루어집니다. 자세한 내용은 [해당 설명 페이지](./concepts/git_vs_http)를 참조하세요. - - - -### 로컬 리포지토리 사용[[use-a-local-repository]] - -로컬 리포지토리 경로를 사용하여 [`Repository`] 객체를 생성하세요: - -```py ->>> from huggingface_hub import Repository ->>> repo = Repository(local_dir="//") -``` - -### 복제[[clone]] - -`clone_from` 매개변수는 Hugging Face 리포지토리 ID에서 로컬 디렉터리로 리포지토리를 복제합니다. 이때 `local_dir` 매개변수를 사용하여 로컬 디렉터리에 저장합니다: - -```py ->>> from huggingface_hub import Repository ->>> repo = Repository(local_dir="w2v2", clone_from="facebook/wav2vec2-large-960h-lv60") -``` - -`clone_from`은 URL을 사용해 리포지토리를 복제할 수 있습니다. - -```py ->>> repo = Repository(local_dir="huggingface-hub", clone_from="https://huggingface.co/facebook/wav2vec2-large-960h-lv60") -``` - -`clone_from` 매개변수를 [`create_repo`]와 결합하여 리포지토리를 만들고 복제할 수 있습니다. - -```py ->>> repo_url = create_repo(repo_id="repo_name") ->>> repo = Repository(local_dir="repo_local_path", clone_from=repo_url) -``` - -리포지토리를 복제할 때 `git_user` 및 `git_email` 매개변수를 지정함으로써 복제한 리포지토리에 Git 사용자 이름과 이메일을 설정할 수 있습니다. 사용자가 해당 리포지토리에 커밋하면 Git은 커밋 작성자를 인식합니다. - -```py ->>> repo = Repository( -... "my-dataset", -... clone_from="/", -... token=True, -... repo_type="dataset", -... git_user="MyName", -... git_email="me@cool.mail" -... ) -``` - -### 브랜치[[branch]] - -브랜치는 현재 코드와 파일에 영향을 미치지 않으면서 협업과 실험에 중요합니다.[`~Repository.git_checkout`]을 사용하여 브랜치 간에 전환할 수 있습니다. 예를 들어, `branch1`에서 `branch2`로 전환하려면: - -```py ->>> from huggingface_hub import Repository ->>> repo = Repository(local_dir="huggingface-hub", clone_from="/", revision='branch1') ->>> repo.git_checkout("branch2") -``` - -### 끌어오기[[pull]] - -[`~Repository.git_pull`]은 원격 리포지토리로부터의 변경사항을 현재 로컬 브랜치에 업데이트하게 합니다. - -```py ->>> from huggingface_hub import Repository ->>> repo.git_pull() -``` - -브랜치가 원격에서의 새 커밋으로 업데이트 된 후에 로컬 커밋을 수행하고자 한다면 `rebase=True`를 설정하세요: - -```py ->>> repo.git_pull(rebase=True) -``` diff --git a/docs/source/ko/guides/upload.md b/docs/source/ko/guides/upload.md index 4f61f81faf..ced6093c5d 100644 --- a/docs/source/ko/guides/upload.md +++ b/docs/source/ko/guides/upload.md @@ -4,12 +4,7 @@ rendered properly in your Markdown viewer. # Hub에 파일 업로드하기[[upload-files-to-the-hub]] -파일과 작업물을 공유하는 것은 Hub의 주요 특성 중 하나입니다. `huggingface_hub`는 Hub에 파일을 업로드하기 위한 몇 가지 옵션을 제공합니다. 이러한 기능을 단독으로 사용하거나 라이브러리에 통합하여 해당 라이브러리의 사용자가 Hub와 더 편리하게 상호작용할 수 있도록 도울 수 있습니다. 이 가이드에서는 파일을 푸시하는 다양한 방법에 대해 설명합니다: - -- Git을 사용하지 않고 푸시하기. -- [Git LFS](https://git-lfs.github.com/)를 사용하여 매우 큰 파일을 푸시하기. -- `commit` 컨텍스트 매니저를 사용하여 푸시하기. -- [`~Repository.push_to_hub`] 함수를 사용하여 푸시하기. +파일과 작업물을 공유하는 것은 Hub의 주요 특성 중 하나입니다. `huggingface_hub`는 Hub에 파일을 업로드하기 위한 몇 가지 옵션을 제공합니다. 이러한 기능을 단독으로 사용하거나 라이브러리에 통합하여 해당 라이브러리의 사용자가 Hub와 더 편리하게 상호작용할 수 있도록 도울 수 있습니다. Hub에 파일을 업로드 하려면 허깅페이스 계정으로 로그인해야 합니다. 인증에 대한 자세한 내용은 [이 페이지](../quick-start#authentication)를 참조해 주세요. @@ -435,118 +430,3 @@ Hub에서 리포지토리를 구성하는 방법에 대한 모범 사례는 [리 자세한 내용은 [이 섹션](https://huggingface.co/docs/huggingface_hub/hf_transfer)을 참조하세요. - -## (레거시) Git LFS로 파일 업로드하기[[legacy-upload-files-with-git-lfs]] - -위에서 설명한 모든 방법은 Hub의 API를 사용하여 파일을 업로드하며, 이는 Hub에 파일을 업로드하는 데 권장되는 방법입니다. -이뿐만 아니라 로컬 리포지토리를 관리하기 위하여 git 도구의 래퍼인 [`Repository`]또한 제공합니다. - - - -[`Repository`]는 공식적으로 지원 종료된 것은 아니지만, 가급적이면 위에서 설명한 HTTP 기반 방법들을 사용할 것을 권장합니다. -이 권장 사항에 대한 자세한 내용은 HTTP 기반 방식과 Git 기반 방식 간의 핵심적인 차이점을 설명하는 [이 가이드](../concepts/git_vs_http)를 참조하세요. - - - -Git LFS는 10MB보다 큰 파일을 자동으로 처리합니다. 하지만 매우 큰 파일(5GB 이상)의 경우, Git LFS용 사용자 지정 전송 에이전트를 설치해야 합니다: - -```bash -hf lfs-enable-largefiles -``` - -매우 큰 파일이 있는 각 리포지토리에 대해 이 옵션을 설치해야 합니다. -설치가 완료되면 5GB보다 큰 파일을 푸시할 수 있습니다. - -### 커밋 컨텍스트 관리자[[commit-context-manager]] - -`commit` 컨텍스트 관리자는 가장 일반적인 네 가지 Git 명령인 pull, add, commit, push를 처리합니다. -`git-lfs`는 10MB보다 큰 파일을 자동으로 추적합니다. -다음 예제에서는 `commit` 컨텍스트 관리자가 다음과 같은 작업을 수행합니다: - -1. `text-files` 리포지토리에서 pull. -2. `file.txt`에 변경 내용을 add. -3. 변경 내용을 commit. -4. 변경 내용을 `text-files` 리포지토리에 push. - -```python ->>> from huggingface_hub import Repository ->>> with Repository(local_dir="text-files", clone_from="/text-files").commit(commit_message="My first file :)"): -... with open("file.txt", "w+") as f: -... f.write(json.dumps({"hey": 8})) -``` - -다음은 `commit` 컨텍스트 관리자를 사용하여 파일을 저장하고 리포지토리에 업로드하는 방법의 또 다른 예입니다: - -```python ->>> import torch ->>> model = torch.nn.Transformer() ->>> with Repository("torch-model", clone_from="/torch-model", token=True).commit(commit_message="My cool model :)"): -... torch.save(model.state_dict(), "model.pt") -``` - -커밋을 비동기적으로 푸시하려면 `blocking=False`를 설정하세요. -커밋을 푸시하는 동안 스크립트를 계속 실행하고 싶을 때 논 블로킹 동작이 유용합니다. - -```python ->>> with repo.commit(commit_message="My cool model :)", blocking=False) -``` - -`command_queue` 메서드로 푸시 상태를 확인할 수 있습니다: - -```python ->>> last_command = repo.command_queue[-1] ->>> last_command.status -``` - -가능한 상태는 아래 표를 참조하세요: - -| 상태 | 설명 | -| -------- | ----------------------------- | -| -1 | 푸시가 진행 중입니다. | -| 0 | 푸시가 성공적으로 완료되었습니다.| -| Non-zero | 오류가 발생했습니다. | - -`blocking=False`인 경우, 명령이 추적되며 스크립트에서 다른 오류가 발생하더라도 모든 푸시가 완료된 경우에만 스크립트가 종료됩니다. -푸시 상태를 확인하는 데 유용한 몇 가지 추가 명령은 다음과 같습니다: - -```python -# 오류를 검사합니다. ->>> last_command.stderr - -# 푸시 진행여부를 확인합니다. ->>> last_command.is_done - -# 푸시 명령의 에러여부를 파악합니다. ->>> last_command.failed -``` - -### push_to_hub[[pushtohub]] - -[`Repository`] 클래스에는 파일을 추가하고 커밋한 후 리포지토리로 푸시하는 [`~Repository.push_to_hub`] 함수가 있습니다. [`~Repository.push_to_hub`]는 `commit` 컨텍스트 관리자와는 달리 호출하기 전에 먼저 리포지토리에서 업데이트(pull) 작업을 수행 해야 합니다. - -예를 들어 Hub에서 리포지토리를 이미 복제했다면 로컬 디렉터리에서 `repo`를 초기화할 수 있습니다: - -```python ->>> from huggingface_hub import Repository ->>> repo = Repository(local_dir="path/to/local/repo") -``` - -로컬 클론을 [`~Repository.git_pull`]로 업데이트한 다음 파일을 Hub로 푸시합니다: - -```py ->>> repo.git_pull() ->>> repo.push_to_hub(commit_message="Commit my-awesome-file to the Hub") -``` - -그러나 아직 파일을 푸시할 준비가 되지 않았다면 [`~Repository.git_add`] 와 [`~Repository.git_commit`]을 사용하여 파일만 추가하고 커밋할 수 있습니다: - -```py ->>> repo.git_add("path/to/file") ->>> repo.git_commit(commit_message="add my first model config file :)") -``` - -준비가 완료되면 [`~Repository.git_push`]를 사용하여 파일을 리포지토리에 푸시합니다: - -```py ->>> repo.git_push() -``` diff --git a/docs/source/ko/package_reference/repository.md b/docs/source/ko/package_reference/repository.md deleted file mode 100644 index fc70e3e203..0000000000 --- a/docs/source/ko/package_reference/repository.md +++ /dev/null @@ -1,49 +0,0 @@ - - -# 로컬 및 온라인 리포지토리 관리[[managing-local-and-online-repositories]] - -`Repository` 클래스는 `git` 및 `git-lfs` 명령을 감싸는 도우미 클래스로, 매우 큰 리포지토리를 관리하는 데 적합한 툴링을 제공합니다. - -`git` 작업이 포함되거나 리포지토리에서의 협업이 중점이 될 때 권장되는 도구입니다. - -## 리포지토리 클래스[[the-repository-class]] - -[[autodoc]] Repository - - __init__ - - current_branch - - all - -## 도우미 메소드[[helper-methods]] - -[[autodoc]] huggingface_hub.repository.is_git_repo - -[[autodoc]] huggingface_hub.repository.is_local_clone - -[[autodoc]] huggingface_hub.repository.is_tracked_with_lfs - -[[autodoc]] huggingface_hub.repository.is_git_ignored - -[[autodoc]] huggingface_hub.repository.files_to_be_staged - -[[autodoc]] huggingface_hub.repository.is_tracked_upstream - -[[autodoc]] huggingface_hub.repository.commits_to_push - -## 후속 비동기 명령[[following-asynchronous-commands]] - -`Repository` 유틸리티는 비동기적으로 시작할 수 있는 여러 메소드를 제공합니다. -- `git_push` -- `git_pull` -- `push_to_hub` -- `commit` 컨텍스트 관리자 - -이러한 비동기 메소드를 관리하는 유틸리티는 아래를 참조하세요. - -[[autodoc]] Repository - - commands_failed - - commands_in_progress - - wait_for_commands - -[[autodoc]] huggingface_hub.repository.CommandInProgress diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index b534281a5e..13ecb24cf2 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -492,9 +492,6 @@ "ModelCardData", "SpaceCardData", ], - "repository": [ - "Repository", - ], "serialization": [ "StateDictSplit", "get_tf_storage_size", @@ -715,7 +712,6 @@ "REPO_TYPE_SPACE", "RepoCard", "RepoUrl", - "Repository", "SentenceSimilarityInput", "SentenceSimilarityInputData", "SpaceCard", @@ -1512,7 +1508,6 @@ def __dir__(): ModelCardData, # noqa: F401 SpaceCardData, # noqa: F401 ) - from .repository import Repository # noqa: F401 from .serialization import ( StateDictSplit, # noqa: F401 get_tf_storage_size, # noqa: F401 diff --git a/src/huggingface_hub/repository.py b/src/huggingface_hub/repository.py deleted file mode 100644 index 387761cedc..0000000000 --- a/src/huggingface_hub/repository.py +++ /dev/null @@ -1,1477 +0,0 @@ -import atexit -import os -import re -import subprocess -import threading -import time -from contextlib import contextmanager -from pathlib import Path -from typing import Callable, Iterator, Optional, TypedDict, Union -from urllib.parse import urlparse - -from huggingface_hub import constants -from huggingface_hub.repocard import metadata_load, metadata_save - -from .hf_api import HfApi, repo_type_and_id_from_hf_id -from .lfs import LFS_MULTIPART_UPLOAD_COMMAND -from .utils import ( - SoftTemporaryDirectory, - get_token, - logging, - run_subprocess, - tqdm, - validate_hf_hub_args, -) -from .utils._deprecation import _deprecate_method - - -logger = logging.get_logger(__name__) - - -class CommandInProgress: - """ - Utility to follow commands launched asynchronously. - """ - - def __init__( - self, - title: str, - is_done_method: Callable, - status_method: Callable, - process: subprocess.Popen, - post_method: Optional[Callable] = None, - ): - self.title = title - self._is_done = is_done_method - self._status = status_method - self._process = process - self._stderr = "" - self._stdout = "" - self._post_method = post_method - - @property - def is_done(self) -> bool: - """ - Whether the process is done. - """ - result = self._is_done() - - if result and self._post_method is not None: - self._post_method() - self._post_method = None - - return result - - @property - def status(self) -> int: - """ - The exit code/status of the current action. Will return `0` if the - command has completed successfully, and a number between 1 and 255 if - the process errored-out. - - Will return -1 if the command is still ongoing. - """ - return self._status() - - @property - def failed(self) -> bool: - """ - Whether the process errored-out. - """ - return self.status > 0 - - @property - def stderr(self) -> str: - """ - The current output message on the standard error. - """ - if self._process.stderr is not None: - self._stderr += self._process.stderr.read() - return self._stderr - - @property - def stdout(self) -> str: - """ - The current output message on the standard output. - """ - if self._process.stdout is not None: - self._stdout += self._process.stdout.read() - return self._stdout - - def __repr__(self): - status = self.status - - if status == -1: - status = "running" - - return ( - f"[{self.title} command, status code: {status}," - f" {'in progress.' if not self.is_done else 'finished.'} PID:" - f" {self._process.pid}]" - ) - - -def is_git_repo(folder: Union[str, Path]) -> bool: - """ - Check if the folder is the root or part of a git repository - - Args: - folder (`str`): - The folder in which to run the command. - - Returns: - `bool`: `True` if the repository is part of a repository, `False` - otherwise. - """ - folder_exists = os.path.exists(os.path.join(folder, ".git")) - git_branch = subprocess.run("git branch".split(), cwd=folder, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return folder_exists and git_branch.returncode == 0 - - -def is_local_clone(folder: Union[str, Path], remote_url: str) -> bool: - """ - Check if the folder is a local clone of the remote_url - - Args: - folder (`str` or `Path`): - The folder in which to run the command. - remote_url (`str`): - The url of a git repository. - - Returns: - `bool`: `True` if the repository is a local clone of the remote - repository specified, `False` otherwise. - """ - if not is_git_repo(folder): - return False - - remotes = run_subprocess("git remote -v", folder).stdout - - # Remove token for the test with remotes. - remote_url = re.sub(r"https://.*@", "https://", remote_url) - remotes = [re.sub(r"https://.*@", "https://", remote) for remote in remotes.split()] - return remote_url in remotes - - -def is_tracked_with_lfs(filename: Union[str, Path]) -> bool: - """ - Check if the file passed is tracked with git-lfs. - - Args: - filename (`str` or `Path`): - The filename to check. - - Returns: - `bool`: `True` if the file passed is tracked with git-lfs, `False` - otherwise. - """ - folder = Path(filename).parent - filename = Path(filename).name - - try: - p = run_subprocess("git check-attr -a".split() + [filename], folder) - attributes = p.stdout.strip() - except subprocess.CalledProcessError as exc: - if not is_git_repo(folder): - return False - else: - raise OSError(exc.stderr) - - if len(attributes) == 0: - return False - - found_lfs_tag = {"diff": False, "merge": False, "filter": False} - - for attribute in attributes.split("\n"): - for tag in found_lfs_tag.keys(): - if tag in attribute and "lfs" in attribute: - found_lfs_tag[tag] = True - - return all(found_lfs_tag.values()) - - -def is_git_ignored(filename: Union[str, Path]) -> bool: - """ - Check if file is git-ignored. Supports nested .gitignore files. - - Args: - filename (`str` or `Path`): - The filename to check. - - Returns: - `bool`: `True` if the file passed is ignored by `git`, `False` - otherwise. - """ - folder = Path(filename).parent - filename = Path(filename).name - - try: - p = run_subprocess("git check-ignore".split() + [filename], folder, check=False) - # Will return exit code 1 if not gitignored - is_ignored = not bool(p.returncode) - except subprocess.CalledProcessError as exc: - raise OSError(exc.stderr) - - return is_ignored - - -def is_binary_file(filename: Union[str, Path]) -> bool: - """ - Check if file is a binary file. - - Args: - filename (`str` or `Path`): - The filename to check. - - Returns: - `bool`: `True` if the file passed is a binary file, `False` otherwise. - """ - try: - with open(filename, "rb") as f: - content = f.read(10 * (1024**2)) # Read a maximum of 10MB - - # Code sample taken from the following stack overflow thread - # https://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python/7392391#7392391 - text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F}) - return bool(content.translate(None, text_chars)) - except UnicodeDecodeError: - return True - - -def files_to_be_staged(pattern: str = ".", folder: Union[str, Path, None] = None) -> list[str]: - """ - Returns a list of filenames that are to be staged. - - Args: - pattern (`str` or `Path`): - The pattern of filenames to check. Put `.` to get all files. - folder (`str` or `Path`): - The folder in which to run the command. - - Returns: - `list[str]`: List of files that are to be staged. - """ - try: - p = run_subprocess("git ls-files --exclude-standard -mo".split() + [pattern], folder) - if len(p.stdout.strip()): - files = p.stdout.strip().split("\n") - else: - files = [] - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - return files - - -def is_tracked_upstream(folder: Union[str, Path]) -> bool: - """ - Check if the current checked-out branch is tracked upstream. - - Args: - folder (`str` or `Path`): - The folder in which to run the command. - - Returns: - `bool`: `True` if the current checked-out branch is tracked upstream, - `False` otherwise. - """ - try: - run_subprocess("git rev-parse --symbolic-full-name --abbrev-ref @{u}", folder) - return True - except subprocess.CalledProcessError as exc: - if "HEAD" in exc.stderr: - raise OSError("No branch checked out") - - return False - - -def commits_to_push(folder: Union[str, Path], upstream: Optional[str] = None) -> int: - """ - Check the number of commits that would be pushed upstream - - Args: - folder (`str` or `Path`): - The folder in which to run the command. - upstream (`str`, *optional*): - The name of the upstream repository with which the comparison should be - made. - - Returns: - `int`: Number of commits that would be pushed upstream were a `git - push` to proceed. - """ - try: - result = run_subprocess(f"git cherry -v {upstream or ''}", folder) - return len(result.stdout.split("\n")) - 1 - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - -class PbarT(TypedDict): - # Used to store an opened progress bar in `_lfs_log_progress` - bar: tqdm - past_bytes: int - - -@contextmanager -def _lfs_log_progress(): - """ - This is a context manager that will log the Git LFS progress of cleaning, - smudging, pulling and pushing. - """ - - if logger.getEffectiveLevel() >= logging.ERROR: - try: - yield - except Exception: - pass - return - - def output_progress(stopping_event: threading.Event): - """ - To be launched as a separate thread with an event meaning it should stop - the tail. - """ - # Key is tuple(state, filename), value is a dict(tqdm bar and a previous value) - pbars: dict[tuple[str, str], PbarT] = {} - - def close_pbars(): - for pbar in pbars.values(): - pbar["bar"].update(pbar["bar"].total - pbar["past_bytes"]) - pbar["bar"].refresh() - pbar["bar"].close() - - def tail_file(filename) -> Iterator[str]: - """ - Creates a generator to be iterated through, which will return each - line one by one. Will stop tailing the file if the stopping_event is - set. - """ - with open(filename, "r") as file: - current_line = "" - while True: - if stopping_event.is_set(): - close_pbars() - break - - line_bit = file.readline() - if line_bit is not None and not len(line_bit.strip()) == 0: - current_line += line_bit - if current_line.endswith("\n"): - yield current_line - current_line = "" - else: - time.sleep(1) - - # If the file isn't created yet, wait for a few seconds before trying again. - # Can be interrupted with the stopping_event. - while not os.path.exists(os.environ["GIT_LFS_PROGRESS"]): - if stopping_event.is_set(): - close_pbars() - return - - time.sleep(2) - - for line in tail_file(os.environ["GIT_LFS_PROGRESS"]): - try: - state, file_progress, byte_progress, filename = line.split() - except ValueError as error: - # Try/except to ease debugging. See https://github.com/huggingface/huggingface_hub/issues/1373. - raise ValueError(f"Cannot unpack LFS progress line:\n{line}") from error - description = f"{state.capitalize()} file {filename}" - - current_bytes, total_bytes = byte_progress.split("/") - current_bytes_int = int(current_bytes) - total_bytes_int = int(total_bytes) - - pbar = pbars.get((state, filename)) - if pbar is None: - # Initialize progress bar - pbars[(state, filename)] = { - "bar": tqdm( - desc=description, - initial=current_bytes_int, - total=total_bytes_int, - unit="B", - unit_scale=True, - unit_divisor=1024, - name="huggingface_hub.lfs_upload", - ), - "past_bytes": int(current_bytes), - } - else: - # Update progress bar - pbar["bar"].update(current_bytes_int - pbar["past_bytes"]) - pbar["past_bytes"] = current_bytes_int - - current_lfs_progress_value = os.environ.get("GIT_LFS_PROGRESS", "") - - with SoftTemporaryDirectory() as tmpdir: - os.environ["GIT_LFS_PROGRESS"] = os.path.join(tmpdir, "lfs_progress") - logger.debug(f"Following progress in {os.environ['GIT_LFS_PROGRESS']}") - - exit_event = threading.Event() - x = threading.Thread(target=output_progress, args=(exit_event,), daemon=True) - x.start() - - try: - yield - finally: - exit_event.set() - x.join() - - os.environ["GIT_LFS_PROGRESS"] = current_lfs_progress_value - - -class Repository: - """ - Helper class to wrap the git and git-lfs commands. - - The aim is to facilitate interacting with huggingface.co hosted model or - dataset repos, though not a lot here (if any) is actually specific to - huggingface.co. - - - - [`Repository`] is deprecated in favor of the http-based alternatives implemented in - [`HfApi`]. Given its large adoption in legacy code, the complete removal of - [`Repository`] will only happen in release `v1.0`. For more details, please read - https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http. - - - """ - - command_queue: list[CommandInProgress] - - @validate_hf_hub_args - @_deprecate_method( - version="1.0", - message=( - "Please prefer the http-based alternatives instead. Given its large adoption in legacy code, the complete" - " removal is only planned on next major release.\nFor more details, please read" - " https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http." - ), - ) - def __init__( - self, - local_dir: Union[str, Path], - clone_from: Optional[str] = None, - repo_type: Optional[str] = None, - token: Union[bool, str] = True, - git_user: Optional[str] = None, - git_email: Optional[str] = None, - revision: Optional[str] = None, - skip_lfs_files: bool = False, - client: Optional[HfApi] = None, - ): - """ - Instantiate a local clone of a git repo. - - If `clone_from` is set, the repo will be cloned from an existing remote repository. - If the remote repo does not exist, a `EnvironmentError` exception will be thrown. - Please create the remote repo first using [`create_repo`]. - - `Repository` uses the local git credentials by default. If explicitly set, the `token` - or the `git_user`/`git_email` pair will be used instead. - - Args: - local_dir (`str` or `Path`): - path (e.g. `'my_trained_model/'`) to the local directory, where - the `Repository` will be initialized. - clone_from (`str`, *optional*): - Either a repository url or `repo_id`. - Example: - - `"https://huggingface.co/philschmid/playground-tests"` - - `"philschmid/playground-tests"` - repo_type (`str`, *optional*): - To set when cloning a repo from a repo_id. Default is model. - token (`bool` or `str`, *optional*): - A valid authentication token (see https://huggingface.co/settings/token). - If `None` or `True` and machine is logged in (through `hf auth login` - or [`~huggingface_hub.login`]), token will be retrieved from the cache. - If `False`, token is not sent in the request header. - git_user (`str`, *optional*): - will override the `git config user.name` for committing and - pushing files to the hub. - git_email (`str`, *optional*): - will override the `git config user.email` for committing and - pushing files to the hub. - revision (`str`, *optional*): - Revision to checkout after initializing the repository. If the - revision doesn't exist, a branch will be created with that - revision name from the default branch's current HEAD. - skip_lfs_files (`bool`, *optional*, defaults to `False`): - whether to skip git-LFS files or not. - client (`HfApi`, *optional*): - Instance of [`HfApi`] to use when calling the HF Hub API. A new - instance will be created if this is left to `None`. - - Raises: - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) - If the remote repository set in `clone_from` does not exist. - """ - if isinstance(local_dir, Path): - local_dir = str(local_dir) - os.makedirs(local_dir, exist_ok=True) - self.local_dir = os.path.join(os.getcwd(), local_dir) - self._repo_type = repo_type - self.command_queue = [] - self.skip_lfs_files = skip_lfs_files - self.client = client if client is not None else HfApi() - - self.check_git_versions() - - if isinstance(token, str): - self.huggingface_token: Optional[str] = token - elif token is False: - self.huggingface_token = None - else: - # if `True` -> explicit use of the cached token - # if `None` -> implicit use of the cached token - self.huggingface_token = get_token() - - if clone_from is not None: - self.clone_from(repo_url=clone_from) - else: - if is_git_repo(self.local_dir): - logger.debug("[Repository] is a valid git repo") - else: - raise ValueError("If not specifying `clone_from`, you need to pass Repository a valid git clone.") - - if self.huggingface_token is not None and (git_email is None or git_user is None): - user = self.client.whoami(self.huggingface_token) - - if git_email is None: - git_email = user.get("email") - - if git_user is None: - git_user = user.get("fullname") - - if git_user is not None or git_email is not None: - self.git_config_username_and_email(git_user, git_email) - - self.lfs_enable_largefiles() - self.git_credential_helper_store() - - if revision is not None: - self.git_checkout(revision, create_branch_ok=True) - - # This ensures that all commands exit before exiting the Python runtime. - # This will ensure all pushes register on the hub, even if other errors happen in subsequent operations. - atexit.register(self.wait_for_commands) - - @property - def current_branch(self) -> str: - """ - Returns the current checked out branch. - - Returns: - `str`: Current checked out branch. - """ - try: - result = run_subprocess("git rev-parse --abbrev-ref HEAD", self.local_dir).stdout.strip() - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - return result - - def check_git_versions(self): - """ - Checks that `git` and `git-lfs` can be run. - - Raises: - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) - If `git` or `git-lfs` are not installed. - """ - try: - git_version = run_subprocess("git --version", self.local_dir).stdout.strip() - except FileNotFoundError: - raise EnvironmentError("Looks like you do not have git installed, please install.") - - try: - lfs_version = run_subprocess("git-lfs --version", self.local_dir).stdout.strip() - except FileNotFoundError: - raise EnvironmentError( - "Looks like you do not have git-lfs installed, please install." - " You can install from https://git-lfs.github.com/." - " Then run `git lfs install` (you only have to do this once)." - ) - logger.info(git_version + "\n" + lfs_version) - - @validate_hf_hub_args - def clone_from(self, repo_url: str, token: Union[bool, str, None] = None): - """ - Clone from a remote. If the folder already exists, will try to clone the - repository within it. - - If this folder is a git repository with linked history, will try to - update the repository. - - Args: - repo_url (`str`): - The URL from which to clone the repository - token (`Union[str, bool]`, *optional*): - Whether to use the authentication token. It can be: - - a string which is the token itself - - `False`, which would not use the authentication token - - `True`, which would fetch the authentication token from the - local folder and use it (you should be logged in for this to - work). - - `None`, which would retrieve the value of - `self.huggingface_token`. - - - - Raises the following error: - - - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) - if an organization token (starts with "api_org") is passed. Use must use - your own personal access token (see https://hf.co/settings/tokens). - - - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) - if you are trying to clone the repository in a non-empty folder, or if the - `git` operations raise errors. - - - """ - token = ( - token # str -> use it - if isinstance(token, str) - else ( - None # `False` -> explicit no token - if token is False - else self.huggingface_token # `None` or `True` -> use default - ) - ) - if token is not None and token.startswith("api_org"): - raise ValueError( - "You must use your personal access token, not an Organization token" - " (see https://hf.co/settings/tokens)." - ) - - hub_url = self.client.endpoint - if hub_url in repo_url or ("http" not in repo_url and len(repo_url.split("/")) <= 2): - repo_type, namespace, repo_name = repo_type_and_id_from_hf_id(repo_url, hub_url=hub_url) - repo_id = f"{namespace}/{repo_name}" if namespace is not None else repo_name - - if repo_type is not None: - self._repo_type = repo_type - - repo_url = hub_url + "/" - - if self._repo_type in constants.REPO_TYPES_URL_PREFIXES: - repo_url += constants.REPO_TYPES_URL_PREFIXES[self._repo_type] - - if token is not None: - # Add token in git url when provided - scheme = urlparse(repo_url).scheme - repo_url = repo_url.replace(f"{scheme}://", f"{scheme}://user:{token}@") - - repo_url += repo_id - - # For error messages, it's cleaner to show the repo url without the token. - clean_repo_url = re.sub(r"(https?)://.*@", r"\1://", repo_url) - try: - run_subprocess("git lfs install", self.local_dir) - - # checks if repository is initialized in a empty repository or in one with files - if len(os.listdir(self.local_dir)) == 0: - logger.warning(f"Cloning {clean_repo_url} into local empty directory.") - - with _lfs_log_progress(): - env = os.environ.copy() - - if self.skip_lfs_files: - env.update({"GIT_LFS_SKIP_SMUDGE": "1"}) - - run_subprocess( - # 'git lfs clone' is deprecated (will display a warning in the terminal) - # but we still use it as it provides a nicer UX when downloading large - # files (shows progress). - f"{'git clone' if self.skip_lfs_files else 'git lfs clone'} {repo_url} .", - self.local_dir, - env=env, - ) - else: - # Check if the folder is the root of a git repository - if not is_git_repo(self.local_dir): - raise EnvironmentError( - "Tried to clone a repository in a non-empty folder that isn't" - f" a git repository ('{self.local_dir}'). If you really want to" - f" do this, do it manually:\n cd {self.local_dir} && git init" - " && git remote add origin && git pull origin main\n or clone" - " repo to a new folder and move your existing files there" - " afterwards." - ) - - if is_local_clone(self.local_dir, repo_url): - logger.warning( - f"{self.local_dir} is already a clone of {clean_repo_url}." - " Make sure you pull the latest changes with" - " `repo.git_pull()`." - ) - else: - output = run_subprocess("git remote get-url origin", self.local_dir, check=False) - - error_msg = ( - f"Tried to clone {clean_repo_url} in an unrelated git" - " repository.\nIf you believe this is an error, please add" - f" a remote with the following URL: {clean_repo_url}." - ) - if output.returncode == 0: - clean_local_remote_url = re.sub(r"https://.*@", "https://", output.stdout) - error_msg += f"\nLocal path has its origin defined as: {clean_local_remote_url}" - raise EnvironmentError(error_msg) - - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - def git_config_username_and_email(self, git_user: Optional[str] = None, git_email: Optional[str] = None): - """ - Sets git username and email (only in the current repo). - - Args: - git_user (`str`, *optional*): - The username to register through `git`. - git_email (`str`, *optional*): - The email to register through `git`. - """ - try: - if git_user is not None: - run_subprocess("git config user.name".split() + [git_user], self.local_dir) - - if git_email is not None: - run_subprocess(f"git config user.email {git_email}".split(), self.local_dir) - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - def git_credential_helper_store(self): - """ - Sets the git credential helper to `store` - """ - try: - run_subprocess("git config credential.helper store", self.local_dir) - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - def git_head_hash(self) -> str: - """ - Get commit sha on top of HEAD. - - Returns: - `str`: The current checked out commit SHA. - """ - try: - p = run_subprocess("git rev-parse HEAD", self.local_dir) - return p.stdout.strip() - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - def git_remote_url(self) -> str: - """ - Get URL to origin remote. - - Returns: - `str`: The URL of the `origin` remote. - """ - try: - p = run_subprocess("git config --get remote.origin.url", self.local_dir) - url = p.stdout.strip() - # Strip basic auth info. - return re.sub(r"https://.*@", "https://", url) - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - def git_head_commit_url(self) -> str: - """ - Get URL to last commit on HEAD. We assume it's been pushed, and the url - scheme is the same one as for GitHub or HuggingFace. - - Returns: - `str`: The URL to the current checked-out commit. - """ - sha = self.git_head_hash() - url = self.git_remote_url() - if url.endswith("/"): - url = url[:-1] - return f"{url}/commit/{sha}" - - def list_deleted_files(self) -> list[str]: - """ - Returns a list of the files that are deleted in the working directory or - index. - - Returns: - `list[str]`: A list of files that have been deleted in the working - directory or index. - """ - try: - git_status = run_subprocess("git status -s", self.local_dir).stdout.strip() - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - if len(git_status) == 0: - return [] - - # Receives a status like the following - # D .gitignore - # D new_file.json - # AD new_file1.json - # ?? new_file2.json - # ?? new_file4.json - - # Strip each line of whitespaces - modified_files_statuses = [status.strip() for status in git_status.split("\n")] - - # Only keep files that are deleted using the D prefix - deleted_files_statuses = [status for status in modified_files_statuses if "D" in status.split()[0]] - - # Remove the D prefix and strip to keep only the relevant filename - deleted_files = [status.split()[-1].strip() for status in deleted_files_statuses] - - return deleted_files - - def lfs_track(self, patterns: Union[str, list[str]], filename: bool = False): - """ - Tell git-lfs to track files according to a pattern. - - Setting the `filename` argument to `True` will treat the arguments as - literal filenames, not as patterns. Any special glob characters in the - filename will be escaped when writing to the `.gitattributes` file. - - Args: - patterns (`Union[str, list[str]]`): - The pattern, or list of patterns, to track with git-lfs. - filename (`bool`, *optional*, defaults to `False`): - Whether to use the patterns as literal filenames. - """ - if isinstance(patterns, str): - patterns = [patterns] - try: - for pattern in patterns: - run_subprocess( - f"git lfs track {'--filename' if filename else ''} {pattern}", - self.local_dir, - ) - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - def lfs_untrack(self, patterns: Union[str, list[str]]): - """ - Tell git-lfs to untrack those files. - - Args: - patterns (`Union[str, list[str]]`): - The pattern, or list of patterns, to untrack with git-lfs. - """ - if isinstance(patterns, str): - patterns = [patterns] - try: - for pattern in patterns: - run_subprocess("git lfs untrack".split() + [pattern], self.local_dir) - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - def lfs_enable_largefiles(self): - """ - HF-specific. This enables upload support of files >5GB. - """ - try: - lfs_config = "git config lfs.customtransfer.multipart" - run_subprocess(f"{lfs_config}.path hf", self.local_dir) - run_subprocess( - f"{lfs_config}.args {LFS_MULTIPART_UPLOAD_COMMAND}", - self.local_dir, - ) - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - def auto_track_binary_files(self, pattern: str = ".") -> list[str]: - """ - Automatically track binary files with git-lfs. - - Args: - pattern (`str`, *optional*, defaults to "."): - The pattern with which to track files that are binary. - - Returns: - `list[str]`: List of filenames that are now tracked due to being - binary files - """ - files_to_be_tracked_with_lfs = [] - - deleted_files = self.list_deleted_files() - - for filename in files_to_be_staged(pattern, folder=self.local_dir): - if filename in deleted_files: - continue - - path_to_file = os.path.join(os.getcwd(), self.local_dir, filename) - - if not (is_tracked_with_lfs(path_to_file) or is_git_ignored(path_to_file)): - size_in_mb = os.path.getsize(path_to_file) / (1024 * 1024) - - if size_in_mb >= 10: - logger.warning( - "Parsing a large file to check if binary or not. Tracking large" - " files using `repository.auto_track_large_files` is" - " recommended so as to not load the full file in memory." - ) - - is_binary = is_binary_file(path_to_file) - - if is_binary: - self.lfs_track(filename) - files_to_be_tracked_with_lfs.append(filename) - - # Cleanup the .gitattributes if files were deleted - self.lfs_untrack(deleted_files) - - return files_to_be_tracked_with_lfs - - def auto_track_large_files(self, pattern: str = ".") -> list[str]: - """ - Automatically track large files (files that weigh more than 10MBs) with - git-lfs. - - Args: - pattern (`str`, *optional*, defaults to "."): - The pattern with which to track files that are above 10MBs. - - Returns: - `list[str]`: List of filenames that are now tracked due to their - size. - """ - files_to_be_tracked_with_lfs = [] - - deleted_files = self.list_deleted_files() - - for filename in files_to_be_staged(pattern, folder=self.local_dir): - if filename in deleted_files: - continue - - path_to_file = os.path.join(os.getcwd(), self.local_dir, filename) - size_in_mb = os.path.getsize(path_to_file) / (1024 * 1024) - - if size_in_mb >= 10 and not is_tracked_with_lfs(path_to_file) and not is_git_ignored(path_to_file): - self.lfs_track(filename) - files_to_be_tracked_with_lfs.append(filename) - - # Cleanup the .gitattributes if files were deleted - self.lfs_untrack(deleted_files) - - return files_to_be_tracked_with_lfs - - def lfs_prune(self, recent=False): - """ - git lfs prune - - Args: - recent (`bool`, *optional*, defaults to `False`): - Whether to prune files even if they were referenced by recent - commits. See the following - [link](https://github.com/git-lfs/git-lfs/blob/f3d43f0428a84fc4f1e5405b76b5a73ec2437e65/docs/man/git-lfs-prune.1.ronn#recent-files) - for more information. - """ - try: - with _lfs_log_progress(): - result = run_subprocess(f"git lfs prune {'--recent' if recent else ''}", self.local_dir) - logger.info(result.stdout) - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - def git_pull(self, rebase: bool = False, lfs: bool = False): - """ - git pull - - Args: - rebase (`bool`, *optional*, defaults to `False`): - Whether to rebase the current branch on top of the upstream - branch after fetching. - lfs (`bool`, *optional*, defaults to `False`): - Whether to fetch the LFS files too. This option only changes the - behavior when a repository was cloned without fetching the LFS - files; calling `repo.git_pull(lfs=True)` will then fetch the LFS - file from the remote repository. - """ - command = "git pull" if not lfs else "git lfs pull" - if rebase: - command += " --rebase" - try: - with _lfs_log_progress(): - result = run_subprocess(command, self.local_dir) - logger.info(result.stdout) - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - def git_add(self, pattern: str = ".", auto_lfs_track: bool = False): - """ - git add - - Setting the `auto_lfs_track` parameter to `True` will automatically - track files that are larger than 10MB with `git-lfs`. - - Args: - pattern (`str`, *optional*, defaults to "."): - The pattern with which to add files to staging. - auto_lfs_track (`bool`, *optional*, defaults to `False`): - Whether to automatically track large and binary files with - git-lfs. Any file over 10MB in size, or in binary format, will - be automatically tracked. - """ - if auto_lfs_track: - # Track files according to their size (>=10MB) - tracked_files = self.auto_track_large_files(pattern) - - # Read the remaining files and track them if they're binary - tracked_files.extend(self.auto_track_binary_files(pattern)) - - if tracked_files: - logger.warning( - f"Adding files tracked by Git LFS: {tracked_files}. This may take a" - " bit of time if the files are large." - ) - - try: - result = run_subprocess("git add -v".split() + [pattern], self.local_dir) - logger.info(f"Adding to index:\n{result.stdout}\n") - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - def git_commit(self, commit_message: str = "commit files to HF hub"): - """ - git commit - - Args: - commit_message (`str`, *optional*, defaults to "commit files to HF hub"): - The message attributed to the commit. - """ - try: - result = run_subprocess("git commit -v -m".split() + [commit_message], self.local_dir) - logger.info(f"Committed:\n{result.stdout}\n") - except subprocess.CalledProcessError as exc: - if len(exc.stderr) > 0: - raise EnvironmentError(exc.stderr) - else: - raise EnvironmentError(exc.stdout) - - def git_push( - self, - upstream: Optional[str] = None, - blocking: bool = True, - auto_lfs_prune: bool = False, - ) -> Union[str, tuple[str, CommandInProgress]]: - """ - git push - - If used without setting `blocking`, will return url to commit on remote - repo. If used with `blocking=True`, will return a tuple containing the - url to commit and the command object to follow for information about the - process. - - Args: - upstream (`str`, *optional*): - Upstream to which this should push. If not specified, will push - to the lastly defined upstream or to the default one (`origin - main`). - blocking (`bool`, *optional*, defaults to `True`): - Whether the function should return only when the push has - finished. Setting this to `False` will return an - `CommandInProgress` object which has an `is_done` property. This - property will be set to `True` when the push is finished. - auto_lfs_prune (`bool`, *optional*, defaults to `False`): - Whether to automatically prune files once they have been pushed - to the remote. - """ - command = "git push" - - if upstream: - command += f" --set-upstream {upstream}" - - number_of_commits = commits_to_push(self.local_dir, upstream) - - if number_of_commits > 1: - logger.warning(f"Several commits ({number_of_commits}) will be pushed upstream.") - if blocking: - logger.warning("The progress bars may be unreliable.") - - try: - with _lfs_log_progress(): - process = subprocess.Popen( - command.split(), - stderr=subprocess.PIPE, - stdout=subprocess.PIPE, - encoding="utf-8", - cwd=self.local_dir, - ) - - if blocking: - stdout, stderr = process.communicate() - return_code = process.poll() - process.kill() - - if len(stderr): - logger.warning(stderr) - - if return_code: - raise subprocess.CalledProcessError(return_code, process.args, output=stdout, stderr=stderr) - - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - if not blocking: - - def status_method(): - status = process.poll() - if status is None: - return -1 - else: - return status - - command_in_progress = CommandInProgress( - "push", - is_done_method=lambda: process.poll() is not None, - status_method=status_method, - process=process, - post_method=self.lfs_prune if auto_lfs_prune else None, - ) - - self.command_queue.append(command_in_progress) - - return self.git_head_commit_url(), command_in_progress - - if auto_lfs_prune: - self.lfs_prune() - - return self.git_head_commit_url() - - def git_checkout(self, revision: str, create_branch_ok: bool = False): - """ - git checkout a given revision - - Specifying `create_branch_ok` to `True` will create the branch to the - given revision if that revision doesn't exist. - - Args: - revision (`str`): - The revision to checkout. - create_branch_ok (`str`, *optional*, defaults to `False`): - Whether creating a branch named with the `revision` passed at - the current checked-out reference if `revision` isn't an - existing revision is allowed. - """ - try: - result = run_subprocess(f"git checkout {revision}", self.local_dir) - logger.warning(f"Checked out {revision} from {self.current_branch}.") - logger.warning(result.stdout) - except subprocess.CalledProcessError as exc: - if not create_branch_ok: - raise EnvironmentError(exc.stderr) - else: - try: - result = run_subprocess(f"git checkout -b {revision}", self.local_dir) - logger.warning( - f"Revision `{revision}` does not exist. Created and checked out branch `{revision}`." - ) - logger.warning(result.stdout) - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - def tag_exists(self, tag_name: str, remote: Optional[str] = None) -> bool: - """ - Check if a tag exists or not. - - Args: - tag_name (`str`): - The name of the tag to check. - remote (`str`, *optional*): - Whether to check if the tag exists on a remote. This parameter - should be the identifier of the remote. - - Returns: - `bool`: Whether the tag exists. - """ - if remote: - try: - result = run_subprocess(f"git ls-remote origin refs/tags/{tag_name}", self.local_dir).stdout.strip() - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - return len(result) != 0 - else: - try: - git_tags = run_subprocess("git tag", self.local_dir).stdout.strip() - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - git_tags = git_tags.split("\n") - return tag_name in git_tags - - def delete_tag(self, tag_name: str, remote: Optional[str] = None) -> bool: - """ - Delete a tag, both local and remote, if it exists - - Args: - tag_name (`str`): - The tag name to delete. - remote (`str`, *optional*): - The remote on which to delete the tag. - - Returns: - `bool`: `True` if deleted, `False` if the tag didn't exist. - If remote is not passed, will just be updated locally - """ - delete_locally = True - delete_remotely = True - - if not self.tag_exists(tag_name): - delete_locally = False - - if not self.tag_exists(tag_name, remote=remote): - delete_remotely = False - - if delete_locally: - try: - run_subprocess(["git", "tag", "-d", tag_name], self.local_dir).stdout.strip() - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - if remote and delete_remotely: - try: - run_subprocess(f"git push {remote} --delete {tag_name}", self.local_dir).stdout.strip() - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - return True - - def add_tag(self, tag_name: str, message: Optional[str] = None, remote: Optional[str] = None): - """ - Add a tag at the current head and push it - - If remote is None, will just be updated locally - - If no message is provided, the tag will be lightweight. if a message is - provided, the tag will be annotated. - - Args: - tag_name (`str`): - The name of the tag to be added. - message (`str`, *optional*): - The message that accompanies the tag. The tag will turn into an - annotated tag if a message is passed. - remote (`str`, *optional*): - The remote on which to add the tag. - """ - if message: - tag_args = ["git", "tag", "-a", tag_name, "-m", message] - else: - tag_args = ["git", "tag", tag_name] - - try: - run_subprocess(tag_args, self.local_dir).stdout.strip() - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - if remote: - try: - run_subprocess(f"git push {remote} {tag_name}", self.local_dir).stdout.strip() - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - def is_repo_clean(self) -> bool: - """ - Return whether or not the git status is clean or not - - Returns: - `bool`: `True` if the git status is clean, `False` otherwise. - """ - try: - git_status = run_subprocess("git status --porcelain", self.local_dir).stdout.strip() - except subprocess.CalledProcessError as exc: - raise EnvironmentError(exc.stderr) - - return len(git_status) == 0 - - def push_to_hub( - self, - commit_message: str = "commit files to HF hub", - blocking: bool = True, - clean_ok: bool = True, - auto_lfs_prune: bool = False, - ) -> Union[None, str, tuple[str, CommandInProgress]]: - """ - Helper to add, commit, and push files to remote repository on the - HuggingFace Hub. Will automatically track large files (>10MB). - - Args: - commit_message (`str`): - Message to use for the commit. - blocking (`bool`, *optional*, defaults to `True`): - Whether the function should return only when the `git push` has - finished. - clean_ok (`bool`, *optional*, defaults to `True`): - If True, this function will return None if the repo is - untouched. Default behavior is to fail because the git command - fails. - auto_lfs_prune (`bool`, *optional*, defaults to `False`): - Whether to automatically prune files once they have been pushed - to the remote. - """ - if clean_ok and self.is_repo_clean(): - logger.info("Repo currently clean. Ignoring push_to_hub") - return None - self.git_add(auto_lfs_track=True) - self.git_commit(commit_message) - return self.git_push( - upstream=f"origin {self.current_branch}", - blocking=blocking, - auto_lfs_prune=auto_lfs_prune, - ) - - @contextmanager - def commit( - self, - commit_message: str, - branch: Optional[str] = None, - track_large_files: bool = True, - blocking: bool = True, - auto_lfs_prune: bool = False, - ): - """ - Context manager utility to handle committing to a repository. This - automatically tracks large files (>10Mb) with git-lfs. Set the - `track_large_files` argument to `False` if you wish to ignore that - behavior. - - Args: - commit_message (`str`): - Message to use for the commit. - branch (`str`, *optional*): - The branch on which the commit will appear. This branch will be - checked-out before any operation. - track_large_files (`bool`, *optional*, defaults to `True`): - Whether to automatically track large files or not. Will do so by - default. - blocking (`bool`, *optional*, defaults to `True`): - Whether the function should return only when the `git push` has - finished. - auto_lfs_prune (`bool`, defaults to `True`): - Whether to automatically prune files once they have been pushed - to the remote. - - Examples: - - ```python - >>> with Repository( - ... "text-files", - ... clone_from="/text-files", - ... token=True, - >>> ).commit("My first file :)"): - ... with open("file.txt", "w+") as f: - ... f.write(json.dumps({"hey": 8})) - - >>> import torch - - >>> model = torch.nn.Transformer() - >>> with Repository( - ... "torch-model", - ... clone_from="/torch-model", - ... token=True, - >>> ).commit("My cool model :)"): - ... torch.save(model.state_dict(), "model.pt") - ``` - - """ - - files_to_stage = files_to_be_staged(".", folder=self.local_dir) - - if len(files_to_stage): - files_in_msg = str(files_to_stage[:5])[:-1] + ", ...]" if len(files_to_stage) > 5 else str(files_to_stage) - logger.error( - "There exists some updated files in the local repository that are not" - f" committed: {files_in_msg}. This may lead to errors if checking out" - " a branch. These files and their modifications will be added to the" - " current commit." - ) - - if branch is not None: - self.git_checkout(branch, create_branch_ok=True) - - if is_tracked_upstream(self.local_dir): - logger.warning("Pulling changes ...") - self.git_pull(rebase=True) - else: - logger.warning(f"The current branch has no upstream branch. Will push to 'origin {self.current_branch}'") - - current_working_directory = os.getcwd() - os.chdir(os.path.join(current_working_directory, self.local_dir)) - - try: - yield self - finally: - self.git_add(auto_lfs_track=track_large_files) - - try: - self.git_commit(commit_message) - except OSError as e: - # If no changes are detected, there is nothing to commit. - if "nothing to commit" not in str(e): - raise e - - try: - self.git_push( - upstream=f"origin {self.current_branch}", - blocking=blocking, - auto_lfs_prune=auto_lfs_prune, - ) - except OSError as e: - # If no changes are detected, there is nothing to commit. - if "could not read Username" in str(e): - raise OSError("Couldn't authenticate user for push. Did you set `token` to `True`?") from e - else: - raise e - - os.chdir(current_working_directory) - - def repocard_metadata_load(self) -> Optional[dict]: - filepath = os.path.join(self.local_dir, constants.REPOCARD_NAME) - if os.path.isfile(filepath): - return metadata_load(filepath) - return None - - def repocard_metadata_save(self, data: dict) -> None: - return metadata_save(os.path.join(self.local_dir, constants.REPOCARD_NAME), data) - - @property - def commands_failed(self): - """ - Returns the asynchronous commands that failed. - """ - return [c for c in self.command_queue if c.status > 0] - - @property - def commands_in_progress(self): - """ - Returns the asynchronous commands that are currently in progress. - """ - return [c for c in self.command_queue if not c.is_done] - - def wait_for_commands(self): - """ - Blocking method: blocks all subsequent execution until all commands have - been processed. - """ - index = 0 - for command_failed in self.commands_failed: - logger.error(f"The {command_failed.title} command with PID {command_failed._process.pid} failed.") - logger.error(command_failed.stderr) - - while self.commands_in_progress: - if index % 10 == 0: - logger.warning( - f"Waiting for the following commands to finish before shutting down: {self.commands_in_progress}." - ) - - index += 1 - - time.sleep(1) diff --git a/tests/test_repository.py b/tests/test_repository.py deleted file mode 100644 index 772dc9850f..0000000000 --- a/tests/test_repository.py +++ /dev/null @@ -1,895 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import json -import os -import time -import unittest -from pathlib import Path - -import httpx -import pytest - -from huggingface_hub import RepoUrl -from huggingface_hub.hf_api import HfApi -from huggingface_hub.repository import ( - Repository, - is_tracked_upstream, - is_tracked_with_lfs, -) -from huggingface_hub.utils import SoftTemporaryDirectory, logging, run_subprocess - -from .testing_constants import ENDPOINT_STAGING, TOKEN -from .testing_utils import ( - expect_deprecation, - repo_name, - use_tmp_repo, - with_production_testing, -) - - -logger = logging.get_logger(__name__) - - -@pytest.mark.usefixtures("fx_cache_dir") -class RepositoryTestAbstract(unittest.TestCase): - cache_dir: Path - repo_path: Path - - # This content is 5MB (under 10MB) - small_content = json.dumps([100] * int(1e6)) - - # This content is 20MB (over 10MB) - large_content = json.dumps([100] * int(4e6)) - - # This content is binary (contains the null character) - binary_content = "\x00\x00\x00\x00" - - _api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) - - @classmethod - def setUp(self) -> None: - self.repo_path = self.cache_dir / "working_dir" - self.repo_path.mkdir() - - def _create_dummy_files(self): - # Create dummy files - # one is lfs-tracked, the other is not. - small_file = self.repo_path / "dummy.txt" - small_file.write_text(self.small_content) - - binary_file = self.repo_path / "model.bin" - binary_file.write_text(self.binary_content) - - -class TestRepositoryShared(RepositoryTestAbstract): - """Tests in this class shares a single repo on the Hub (common to all tests). - - These tests must not push data to it. - """ - - @classmethod - def setUpClass(cls): - """ - Share this valid token in all tests below. - """ - super().setUpClass() - cls.repo_url = cls._api.create_repo(repo_id=repo_name()) - cls.repo_id = cls.repo_url.repo_id - cls._api.upload_file( - path_or_fileobj=cls.binary_content.encode(), - path_in_repo="random_file.txt", - repo_id=cls.repo_id, - ) - - @classmethod - def tearDownClass(cls): - cls._api.delete_repo(repo_id=cls.repo_id) - - @expect_deprecation("Repository") - def test_clone_from_repo_url(self): - Repository(self.repo_path, clone_from=self.repo_url) - - @expect_deprecation("Repository") - def test_clone_from_repo_id(self): - Repository(self.repo_path, clone_from=self.repo_id) - - @expect_deprecation("Repository") - def test_clone_from_repo_name_no_namespace_fails(self): - with self.assertRaises(EnvironmentError): - Repository(self.repo_path, clone_from=self.repo_id.split("/")[1], token=TOKEN) - - @expect_deprecation("Repository") - def test_clone_from_not_hf_url(self): - # Should not error out - Repository(self.repo_path, clone_from="https://hf.co/hf-internal-testing/huggingface-hub-dummy-repository") - - @expect_deprecation("Repository") - def test_clone_from_missing_repo(self): - """If the repo does not exist an EnvironmentError is raised.""" - with self.assertRaises(EnvironmentError): - Repository(self.repo_path, clone_from="missing_repo") - - @expect_deprecation("Repository") - @with_production_testing - def test_clone_from_prod_canonical_repo_id(self): - Repository(self.repo_path, clone_from="bert-base-cased", skip_lfs_files=True) - - @expect_deprecation("Repository") - @with_production_testing - def test_clone_from_prod_canonical_repo_url(self): - Repository(self.repo_path, clone_from="https://huggingface.co/bert-base-cased", skip_lfs_files=True) - - @expect_deprecation("Repository") - def test_init_from_existing_local_clone(self): - run_subprocess(["git", "clone", self.repo_url, str(self.repo_path)]) - - repo = Repository(self.repo_path) - repo.lfs_track(["*.pdf"]) - repo.lfs_enable_largefiles() - repo.git_pull() - - @expect_deprecation("Repository") - def test_init_failure(self): - with self.assertRaises(ValueError): - Repository(self.repo_path) - - @expect_deprecation("Repository") - def test_init_clone_in_empty_folder(self): - repo = Repository(self.repo_path, clone_from=self.repo_url) - repo.lfs_track(["*.pdf"]) - repo.lfs_enable_largefiles() - repo.git_pull() - self.assertIn("random_file.txt", os.listdir(self.repo_path)) - - @expect_deprecation("Repository") - def test_git_lfs_filename(self): - run_subprocess("git init", folder=self.repo_path) - - repo = Repository(self.repo_path) - large_file = self.repo_path / "large_file[].txt" - large_file.write_text(self.large_content) - - repo.git_add() - - repo.lfs_track([large_file.name]) - self.assertFalse(is_tracked_with_lfs(large_file)) - - repo.lfs_track([large_file.name], filename=True) - self.assertTrue(is_tracked_with_lfs(large_file)) - - @expect_deprecation("Repository") - def test_init_clone_in_nonempty_folder(self): - self._create_dummy_files() - with self.assertRaises(EnvironmentError): - Repository(self.repo_path, clone_from=self.repo_url) - - @expect_deprecation("Repository") - def test_init_clone_in_nonempty_linked_git_repo_with_token(self): - Repository(self.repo_path, clone_from=self.repo_url, token=TOKEN) - Repository(self.repo_path, clone_from=self.repo_url, token=TOKEN) - - @expect_deprecation("Repository") - def test_is_tracked_upstream(self): - Repository(self.repo_path, clone_from=self.repo_id) - self.assertTrue(is_tracked_upstream(self.repo_path)) - - @expect_deprecation("Repository") - def test_push_errors_on_wrong_checkout(self): - repo = Repository(self.repo_path, clone_from=self.repo_id) - - head_commit_ref = run_subprocess("git show --oneline -s", folder=self.repo_path).stdout.split()[0] - - repo.git_checkout(head_commit_ref) - - with self.assertRaises(OSError): - with repo.commit("New commit"): - with open("new_file", "w+") as f: - f.write("Ok") - - -class TestRepositoryUniqueRepos(RepositoryTestAbstract): - """Tests in this class use separated repos on the Hub (i.e. 1 test = 1 repo). - - These tests can push data to it. - """ - - def setUp(self): - super().setUp() - self.repo_url = self._api.create_repo(repo_id=repo_name()) - self.repo_id = self.repo_url.repo_id - self._api.upload_file( - path_or_fileobj=self.binary_content.encode(), path_in_repo="random_file.txt", repo_id=self.repo_id - ) - - def tearDown(self): - self._api.delete_repo(repo_id=self.repo_id) - - @expect_deprecation("Repository") - def clone_repo(self, **kwargs) -> Repository: - if "local_dir" not in kwargs: - kwargs["local_dir"] = self.repo_path - if "clone_from" not in kwargs: - kwargs["clone_from"] = self.repo_url - if "token" not in kwargs: - kwargs["token"] = TOKEN - if "git_user" not in kwargs: - kwargs["git_user"] = "ci" - if "git_email" not in kwargs: - kwargs["git_email"] = "ci@dummy.com" - return Repository(**kwargs) - - @use_tmp_repo() - @expect_deprecation("Repository") - def test_init_clone_in_nonempty_non_linked_git_repo(self, repo_url: RepoUrl): - self.clone_repo() - - # Try and clone another repository within the same directory. - # Should error out due to mismatched remotes. - with self.assertRaises(EnvironmentError): - Repository(self.repo_path, clone_from=repo_url) - - def test_init_clone_in_nonempty_linked_git_repo(self): - # Clone the repository to disk - self.clone_repo() - - # Add to the remote repository without doing anything to the local repository. - self._api.upload_file( - path_or_fileobj=self.binary_content.encode(), path_in_repo="random_file_3.txt", repo_id=self.repo_id - ) - - # Cloning the repository in the same directory should not result in a git pull. - self.clone_repo(clone_from=self.repo_url) - self.assertNotIn("random_file_3.txt", os.listdir(self.repo_path)) - - def test_init_clone_in_nonempty_linked_git_repo_unrelated_histories(self): - # Clone the repository to disk - repo = self.clone_repo() - - # Create and commit file locally - (self.repo_path / "random_file_3.txt").write_text("hello world") - repo.git_add() - repo.git_commit("Unrelated commit") - - # Add to the remote repository without doing anything to the local repository. - self._api.upload_file( - path_or_fileobj=self.binary_content.encode(), - path_in_repo="random_file_3.txt", - repo_id=self.repo_url.repo_id, - ) - - # The repo should initialize correctly as the remote is the same, even with unrelated historied - self.clone_repo() - - def test_add_commit_push(self): - repo = self.clone_repo() - self._create_dummy_files() - repo.git_add() - repo.git_commit() - url = repo.git_push() - - # Check that the returned commit url - # actually exists. - r = httpx.head(url) - r.raise_for_status() - - def test_add_commit_push_non_blocking(self): - repo = self.clone_repo() - self._create_dummy_files() - repo.git_add() - repo.git_commit() - url, result = repo.git_push(blocking=False) - - # Check background process - if result._process.poll() is None: - self.assertEqual(result.status, -1) - - while not result.is_done: - time.sleep(0.5) - - self.assertTrue(result.is_done) - self.assertEqual(result.status, 0) - - # Check that the returned commit url - # actually exists. - r = httpx.head(url) - r.raise_for_status() - - def test_context_manager_non_blocking(self): - repo = self.clone_repo() - - with repo.commit("New commit", blocking=False): - (self.repo_path / "dummy.txt").write_text("hello world") - - while repo.commands_in_progress: - time.sleep(1) - - self.assertEqual(len(repo.commands_in_progress), 0) - self.assertEqual(len(repo.command_queue), 1) - self.assertEqual(repo.command_queue[-1].status, 0) - self.assertEqual(repo.command_queue[-1].is_done, True) - self.assertEqual(repo.command_queue[-1].title, "push") - - @unittest.skip("This is a flaky and legacy test") - def test_add_commit_push_non_blocking_process_killed(self): - repo = self.clone_repo() - - # Far too big file: will take forever - (self.repo_path / "dummy.txt").write_text(str([[[1] * 10000] * 1000] * 10)) - repo.git_add(auto_lfs_track=True) - repo.git_commit() - _, result = repo.git_push(blocking=False) - - result._process.kill() - - while result._process.poll() is None: - time.sleep(0.5) - - self.assertTrue(result.is_done) - self.assertEqual(result.status, -9) - - def test_commit_context_manager(self): - # Clone and commit from a first folder - folder_1 = self.repo_path / "folder_1" - clone = self.clone_repo(local_dir=folder_1) - with clone.commit("Commit"): - with open("dummy.txt", "w") as f: - f.write("hello") - with open("model.bin", "w") as f: - f.write("hello") - - # Clone in second folder. Check existence of committed files - folder_2 = self.repo_path / "folder_2" - self.clone_repo(local_dir=folder_2) - files = os.listdir(folder_2) - self.assertTrue("dummy.txt" in files) - self.assertTrue("model.bin" in files) - - def test_clone_skip_lfs_files(self): - # Upload LFS file - self._api.upload_file(path_or_fileobj=b"Bin file", path_in_repo="file.bin", repo_id=self.repo_id) - - repo = self.clone_repo(skip_lfs_files=True) - file_bin = self.repo_path / "file.bin" - - self.assertTrue(file_bin.read_text().startswith("version")) - - repo.git_pull(lfs=True) - - self.assertEqual(file_bin.read_text(), "Bin file") - - def test_commits_on_correct_branch(self): - repo = self.clone_repo() - branch = repo.current_branch - repo.git_checkout("new-branch", create_branch_ok=True) - repo.git_checkout(branch) - - with repo.commit("New commit"): - with open("file.txt", "w+") as f: - f.write("Ok") - - repo.git_checkout("new-branch") - - with repo.commit("New commit"): - with open("new_file.txt", "w+") as f: - f.write("Ok") - - with SoftTemporaryDirectory() as tmp: - clone = self.clone_repo(local_dir=tmp) - files = os.listdir(clone.local_dir) - self.assertTrue("file.txt" in files) - self.assertFalse("new_file.txt" in files) - - clone.git_checkout("new-branch") - files = os.listdir(clone.local_dir) - self.assertFalse("file.txt" in files) - self.assertTrue("new_file.txt" in files) - - def test_repo_checkout_push(self): - repo = self.clone_repo() - - repo.git_checkout("new-branch", create_branch_ok=True) - repo.git_checkout("main") - - (self.repo_path / "file.txt").write_text("OK") - - repo.push_to_hub("Commit #1") - repo.git_checkout("new-branch", create_branch_ok=True) - - (self.repo_path / "new_file.txt").write_text("OK") - - repo.push_to_hub("Commit #2") - - with SoftTemporaryDirectory() as tmp: - clone = self.clone_repo(local_dir=tmp) - files = os.listdir(clone.local_dir) - self.assertTrue("file.txt" in files) - self.assertFalse("new_file.txt" in files) - - clone.git_checkout("new-branch") - files = os.listdir(clone.local_dir) - self.assertFalse("file.txt" in files) - self.assertTrue("new_file.txt" in files) - - def test_repo_checkout_commit_context_manager(self): - repo = self.clone_repo() - - with repo.commit("Commit #1", branch="new-branch"): - with open(os.path.join(repo.local_dir, "file.txt"), "w+") as f: - f.write("Ok") - - with repo.commit("Commit #2", branch="main"): - with open(os.path.join(repo.local_dir, "new_file.txt"), "w+") as f: - f.write("Ok") - - # Maintains lastly used branch - with repo.commit("Commit #3"): - with open(os.path.join(repo.local_dir, "new_file-2.txt"), "w+") as f: - f.write("Ok") - - with SoftTemporaryDirectory() as tmp: - clone = self.clone_repo(local_dir=tmp) - files = os.listdir(clone.local_dir) - self.assertFalse("file.txt" in files) - self.assertTrue("new_file-2.txt" in files) - self.assertTrue("new_file.txt" in files) - - clone.git_checkout("new-branch") - files = os.listdir(clone.local_dir) - self.assertTrue("file.txt" in files) - self.assertFalse("new_file.txt" in files) - self.assertFalse("new_file-2.txt" in files) - - def test_add_tag(self): - repo = self.clone_repo() - repo.add_tag("v4.6.0", remote="origin") - self.assertTrue(repo.tag_exists("v4.6.0", remote="origin")) - - def test_add_annotated_tag(self): - repo = self.clone_repo() - repo.add_tag("v4.5.0", message="This is an annotated tag", remote="origin") - - # Unfortunately git offers no built-in way to check the annotated - # message of a remote tag. - # In order to check that the remote tag was correctly annotated, - # we delete the local tag before pulling the remote tag (which - # should be the same). We then check that this tag is correctly - # annotated. - repo.delete_tag("v4.5.0") - - self.assertTrue(repo.tag_exists("v4.5.0", remote="origin")) - self.assertFalse(repo.tag_exists("v4.5.0")) - - # Tag still exists on remote - run_subprocess("git pull --tags", folder=self.repo_path) - self.assertTrue(repo.tag_exists("v4.5.0")) - - # Tag is annotated - result = run_subprocess("git tag -n9", folder=self.repo_path).stdout.strip() - self.assertIn("This is an annotated tag", result) - - def test_delete_tag(self): - repo = self.clone_repo() - - repo.add_tag("v4.6.0", message="This is an annotated tag", remote="origin") - self.assertTrue(repo.tag_exists("v4.6.0", remote="origin")) - - repo.delete_tag("v4.6.0") - self.assertFalse(repo.tag_exists("v4.6.0")) - self.assertTrue(repo.tag_exists("v4.6.0", remote="origin")) - - repo.delete_tag("v4.6.0", remote="origin") - self.assertFalse(repo.tag_exists("v4.6.0", remote="origin")) - - def test_lfs_prune(self): - repo = self.clone_repo() - - with repo.commit("Committing LFS file"): - with open("file.bin", "w+") as f: - f.write("Random string 1") - - with repo.commit("Committing LFS file"): - with open("file.bin", "w+") as f: - f.write("Random string 2") - - root_directory = self.repo_path / ".git" / "lfs" - git_lfs_files_size = sum(f.stat().st_size for f in root_directory.glob("**/*") if f.is_file()) - repo.lfs_prune() - post_prune_git_lfs_files_size = sum(f.stat().st_size for f in root_directory.glob("**/*") if f.is_file()) - - # Size of the directory holding LFS files was reduced - self.assertLess(post_prune_git_lfs_files_size, git_lfs_files_size) - - def test_lfs_prune_git_push(self): - repo = self.clone_repo() - with repo.commit("Committing LFS file"): - with open("file.bin", "w+") as f: - f.write("Random string 1") - - root_directory = self.repo_path / ".git" / "lfs" - git_lfs_files_size = sum(f.stat().st_size for f in root_directory.glob("**/*") if f.is_file()) - - with open(os.path.join(repo.local_dir, "file.bin"), "w+") as f: - f.write("Random string 2") - - repo.git_add() - repo.git_commit("New commit") - repo.git_push(auto_lfs_prune=True) - - post_prune_git_lfs_files_size = sum(f.stat().st_size for f in root_directory.glob("**/*") if f.is_file()) - - # Size of the directory holding LFS files is the exact same - self.assertEqual(post_prune_git_lfs_files_size, git_lfs_files_size) - - -class TestRepositoryOffline(RepositoryTestAbstract): - """Class to test `Repository` object on local folders only (no cloning from Hub).""" - - repo: Repository - - @classmethod - @expect_deprecation("Repository") - def setUp(self) -> None: - super().setUp() - - run_subprocess("git init", folder=self.repo_path) - - self.repo = Repository(self.repo_path, git_user="ci", git_email="ci@dummy.ci") - - git_attributes_path = self.repo_path / ".gitattributes" - git_attributes_path.write_text("*.pt filter=lfs diff=lfs merge=lfs -text") - - self.repo.git_add(".gitattributes") - self.repo.git_commit("Add .gitattributes") - - def test_is_tracked_with_lfs(self): - txt_1 = self.repo_path / "small_file_1.txt" - txt_2 = self.repo_path / "small_file_2.txt" - pt_1 = self.repo_path / "model.pt" - - txt_1.write_text(self.small_content) - txt_2.write_text(self.small_content) - pt_1.write_text(self.small_content) - - self.repo.lfs_track("small_file_1.txt") - - self.assertTrue(is_tracked_with_lfs(txt_1)) - self.assertFalse(is_tracked_with_lfs(txt_2)) - self.assertTrue(pt_1) - - def test_is_tracked_with_lfs_with_pattern(self): - txt_small_file = self.repo_path / "small_file.txt" - txt_small_file.write_text(self.small_content) - - txt_large_file = self.repo_path / "large_file.txt" - txt_large_file.write_text(self.large_content) - - (self.repo_path / "dir").mkdir() - txt_small_file_in_dir = self.repo_path / "dir" / "small_file.txt" - txt_small_file_in_dir.write_text(self.small_content) - - txt_large_file_in_dir = self.repo_path / "dir" / "large_file.txt" - txt_large_file_in_dir.write_text(self.large_content) - - self.repo.auto_track_large_files("dir") - - self.assertFalse(is_tracked_with_lfs(txt_large_file)) - self.assertFalse(is_tracked_with_lfs(txt_small_file)) - self.assertTrue(is_tracked_with_lfs(txt_large_file_in_dir)) - self.assertFalse(is_tracked_with_lfs(txt_small_file_in_dir)) - - def test_auto_track_large_files(self): - txt_small_file = self.repo_path / "small_file.txt" - txt_small_file.write_text(self.small_content) - - txt_large_file = self.repo_path / "large_file.txt" - txt_large_file.write_text(self.large_content) - - self.repo.auto_track_large_files() - - self.assertTrue(is_tracked_with_lfs(txt_large_file)) - self.assertFalse(is_tracked_with_lfs(txt_small_file)) - - def test_auto_track_binary_files(self): - non_binary_file = self.repo_path / "non_binary_file.txt" - non_binary_file.write_text(self.small_content) - - binary_file = self.repo_path / "binary_file.txt" - binary_file.write_text(self.binary_content) - - self.repo.auto_track_binary_files() - - self.assertFalse(is_tracked_with_lfs(non_binary_file)) - self.assertTrue(is_tracked_with_lfs(binary_file)) - - def test_auto_track_large_files_ignored_with_gitignore(self): - (self.repo_path / "dir").mkdir() - - # Test nested gitignores - gitignore_file = self.repo_path / ".gitignore" - gitignore_file.write_text("large_file.txt") - - gitignore_file_in_dir = self.repo_path / "dir" / ".gitignore" - gitignore_file_in_dir.write_text("large_file_3.txt") - - large_file = self.repo_path / "large_file.txt" - large_file.write_text(self.large_content) - - large_file_2 = self.repo_path / "large_file_2.txt" - large_file_2.write_text(self.large_content) - - large_file_3 = self.repo_path / "dir" / "large_file_3.txt" - large_file_3.write_text(self.large_content) - - large_file_4 = self.repo_path / "dir" / "large_file_4.txt" - large_file_4.write_text(self.large_content) - - self.repo.auto_track_large_files() - - # Large files - self.assertFalse(is_tracked_with_lfs(large_file)) - self.assertTrue(is_tracked_with_lfs(large_file_2)) - - self.assertFalse(is_tracked_with_lfs(large_file_3)) - self.assertTrue(is_tracked_with_lfs(large_file_4)) - - def test_auto_track_binary_files_ignored_with_gitignore(self): - (self.repo_path / "dir").mkdir() - - # Test nested gitignores - gitignore_file = self.repo_path / ".gitignore" - gitignore_file.write_text("binary_file.txt") - - gitignore_file_in_dir = self.repo_path / "dir" / ".gitignore" - gitignore_file_in_dir.write_text("binary_file_3.txt") - - binary_file = self.repo_path / "binary_file.txt" - binary_file.write_text(self.binary_content) - - binary_file_2 = self.repo_path / "binary_file_2.txt" - binary_file_2.write_text(self.binary_content) - - binary_file_3 = self.repo_path / "dir" / "binary_file_3.txt" - binary_file_3.write_text(self.binary_content) - - binary_file_4 = self.repo_path / "dir" / "binary_file_4.txt" - binary_file_4.write_text(self.binary_content) - - self.repo.auto_track_binary_files() - - # Binary files - self.assertFalse(is_tracked_with_lfs(binary_file)) - self.assertTrue(is_tracked_with_lfs(binary_file_2)) - self.assertFalse(is_tracked_with_lfs(binary_file_3)) - self.assertTrue(is_tracked_with_lfs(binary_file_4)) - - def test_auto_track_large_files_through_git_add(self): - txt_small_file = self.repo_path / "small_file.txt" - txt_small_file.write_text(self.small_content) - - txt_large_file = self.repo_path / "large_file.txt" - txt_large_file.write_text(self.large_content) - - self.repo.git_add(auto_lfs_track=True) - - self.assertTrue(is_tracked_with_lfs(txt_large_file)) - self.assertFalse(is_tracked_with_lfs(txt_small_file)) - - def test_auto_track_binary_files_through_git_add(self): - non_binary_file = self.repo_path / "small_file.txt" - non_binary_file.write_text(self.small_content) - - binary_file = self.repo_path / "binary.txt" - binary_file.write_text(self.binary_content) - - self.repo.git_add(auto_lfs_track=True) - - self.assertTrue(is_tracked_with_lfs(binary_file)) - self.assertFalse(is_tracked_with_lfs(non_binary_file)) - - def test_auto_no_track_large_files_through_git_add(self): - txt_small_file = self.repo_path / "small_file.txt" - txt_small_file.write_text(self.small_content) - - txt_large_file = self.repo_path / "large_file.txt" - txt_large_file.write_text(self.large_content) - - self.repo.git_add(auto_lfs_track=False) - - self.assertFalse(is_tracked_with_lfs(txt_large_file)) - self.assertFalse(is_tracked_with_lfs(txt_small_file)) - - def test_auto_no_track_binary_files_through_git_add(self): - non_binary_file = self.repo_path / "small_file.txt" - non_binary_file.write_text(self.small_content) - - binary_file = self.repo_path / "binary.txt" - binary_file.write_text(self.binary_content) - - self.repo.git_add(auto_lfs_track=False) - - self.assertFalse(is_tracked_with_lfs(binary_file)) - self.assertFalse(is_tracked_with_lfs(non_binary_file)) - - def test_auto_track_updates_removed_gitattributes(self): - txt_small_file = self.repo_path / "small_file.txt" - txt_small_file.write_text(self.small_content) - - txt_large_file = self.repo_path / "large_file.txt" - txt_large_file.write_text(self.large_content) - - self.repo.git_add(auto_lfs_track=True) - - self.assertTrue(is_tracked_with_lfs(txt_large_file)) - self.assertFalse(is_tracked_with_lfs(txt_small_file)) - - # Remove large file - txt_large_file.unlink() - - # Auto track should remove the entry from .gitattributes - self.repo.auto_track_large_files() - - # Recreate the large file with smaller contents - txt_large_file.write_text(self.small_content) - - # Ensure the file is not LFS tracked anymore - self.repo.auto_track_large_files() - self.assertFalse(is_tracked_with_lfs(txt_large_file)) - - def test_checkout_non_existing_branch(self): - self.assertRaises(EnvironmentError, self.repo.git_checkout, "brand-new-branch") - - def test_checkout_new_branch(self): - self.repo.git_checkout("new-branch", create_branch_ok=True) - self.assertEqual(self.repo.current_branch, "new-branch") - - def test_is_not_tracked_upstream(self): - self.repo.git_checkout("new-branch", create_branch_ok=True) - self.assertFalse(is_tracked_upstream(self.repo.local_dir)) - - def test_no_branch_checked_out_raises(self): - head_commit_ref = run_subprocess("git show --oneline -s", folder=self.repo_path).stdout.split()[0] - - self.repo.git_checkout(head_commit_ref) - self.assertRaises(OSError, is_tracked_upstream, self.repo.local_dir) - - @expect_deprecation("Repository") - def test_repo_init_checkout_default_revision(self): - # Instantiate repository on a given revision - repo = Repository(self.repo_path, revision="new-branch") - self.assertEqual(repo.current_branch, "new-branch") - - # The revision should be kept when re-initializing the repo - repo_2 = Repository(self.repo_path) - self.assertEqual(repo_2.current_branch, "new-branch") - - @expect_deprecation("Repository") - def test_repo_init_checkout_revision(self): - current_head_hash = self.repo.git_head_hash() - - (self.repo_path / "file.txt").write_text("hello world") - - self.repo.git_add() - self.repo.git_commit("Add file.txt") - - new_head_hash = self.repo.git_head_hash() - - self.assertNotEqual(current_head_hash, new_head_hash) - - previous_head_repo = Repository(self.repo_path, revision=current_head_hash) - files = os.listdir(previous_head_repo.local_dir) - self.assertNotIn("file.txt", files) - - current_head_repo = Repository(self.repo_path, revision=new_head_hash) - files = os.listdir(current_head_repo.local_dir) - self.assertIn("file.txt", files) - - @expect_deprecation("Repository") - def test_repo_user(self): - _ = Repository(self.repo_path, token=TOKEN) - username = run_subprocess("git config user.name", folder=self.repo_path).stdout - email = run_subprocess("git config user.email", folder=self.repo_path).stdout - - # hardcode values to avoid another api call to whoami - self.assertEqual(username.strip(), "Dummy User") - self.assertEqual(email.strip(), "julien@huggingface.co") - - @expect_deprecation("Repository") - def test_repo_passed_user(self): - _ = Repository(self.repo_path, token=TOKEN, git_user="RANDOM_USER", git_email="EMAIL@EMAIL.EMAIL") - username = run_subprocess("git config user.name", folder=self.repo_path).stdout - email = run_subprocess("git config user.email", folder=self.repo_path).stdout - - self.assertEqual(username.strip(), "RANDOM_USER") - self.assertEqual(email.strip(), "EMAIL@EMAIL.EMAIL") - - def test_add_tag(self): - self.repo.add_tag("v4.6.0") - self.assertTrue(self.repo.tag_exists("v4.6.0")) - - def test_add_annotated_tag(self): - self.repo.add_tag("v4.6.0", message="This is an annotated tag") - self.assertTrue(self.repo.tag_exists("v4.6.0")) - - result = run_subprocess("git tag -n9", folder=self.repo_path).stdout.strip() - self.assertIn("This is an annotated tag", result) - - def test_delete_tag(self): - self.repo.add_tag("v4.6.0", message="This is an annotated tag") - self.assertTrue(self.repo.tag_exists("v4.6.0")) - - self.repo.delete_tag("v4.6.0") - self.assertFalse(self.repo.tag_exists("v4.6.0")) - - def test_repo_clean(self): - self.assertTrue(self.repo.is_repo_clean()) - (self.repo_path / "file.txt").write_text("hello world") - self.assertFalse(self.repo.is_repo_clean()) - - -class TestRepositoryDataset(RepositoryTestAbstract): - """Class to test that cloning from a different repo_type works fine.""" - - @classmethod - def setUpClass(cls): - super().setUpClass() - cls.repo_url = cls._api.create_repo(repo_id=repo_name(), repo_type="dataset") - cls.repo_id = cls.repo_url.repo_id - cls._api.upload_file( - path_or_fileobj=cls.binary_content.encode(), - path_in_repo="file.txt", - repo_id=cls.repo_id, - repo_type="dataset", - ) - - @classmethod - def tearDownClass(cls): - super().tearDownClass() - cls._api.delete_repo(repo_id=cls.repo_id, repo_type="dataset") - - @expect_deprecation("Repository") - def test_clone_dataset_with_endpoint_explicit_repo_type(self): - Repository( - self.repo_path, clone_from=self.repo_url, repo_type="dataset", git_user="ci", git_email="ci@dummy.com" - ) - self.assertTrue((self.repo_path / "file.txt").exists()) - - @expect_deprecation("Repository") - def test_clone_dataset_with_endpoint_implicit_repo_type(self): - self.assertIn("dataset", self.repo_url) # Implicit - Repository(self.repo_path, clone_from=self.repo_url, git_user="ci", git_email="ci@dummy.com") - self.assertTrue((self.repo_path / "file.txt").exists()) - - @expect_deprecation("Repository") - def test_clone_dataset_with_repo_id_and_repo_type(self): - Repository( - self.repo_path, clone_from=self.repo_id, repo_type="dataset", git_user="ci", git_email="ci@dummy.com" - ) - self.assertTrue((self.repo_path / "file.txt").exists()) - - @expect_deprecation("Repository") - def test_clone_dataset_no_ci_user_and_email(self): - Repository(self.repo_path, clone_from=self.repo_id, repo_type="dataset") - self.assertTrue((self.repo_path / "file.txt").exists()) - - @expect_deprecation("Repository") - def test_clone_dataset_with_repo_name_and_repo_type_fails(self): - with self.assertRaises(EnvironmentError): - Repository( - self.repo_path, - clone_from=self.repo_id.split("/")[1], - repo_type="dataset", - token=TOKEN, - git_user="ci", - git_email="ci@dummy.com", - )