Commit bec57df

Merge branch 'huggingface:main' into main

akshayballal95 committed Aug 30, 2024
2 parents e531ce7 + c02b7c3, commit bec57df
Showing 115 changed files with 9,427 additions and 253 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/python.yml
@@ -18,9 +18,9 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest] # For now, only test on Linux
-    steps:
+    steps:
     - name: Checkout repository
-      uses: actions/checkout@v2
+      uses: actions/checkout@v4
 
     - name: Install Rust
       uses: actions-rs/toolchain@v1
@@ -65,4 +65,4 @@ jobs:
       working-directory: ./candle-pyo3
       run: |
         source .env/bin/activate
-        python -m pytest -s -v tests
+        python -m pytest -s -v tests

(The visually identical -/+ pairs above differ only in trailing whitespace.)
12 changes: 6 additions & 6 deletions .github/workflows/rust-ci.yml
@@ -1,6 +1,6 @@
-on:
+on:
   push:
-    branches:
+    branches:
     - main
   pull_request:
 
@@ -15,7 +15,7 @@ jobs:
         os: [ubuntu-latest, windows-latest, macOS-latest]
         rust: [stable]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
@@ -34,7 +34,7 @@ jobs:
         os: [ubuntu-latest, windows-latest, macOS-latest]
         rust: [stable]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
@@ -49,7 +49,7 @@ jobs:
     name: Rustfmt
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
@@ -65,7 +65,7 @@ jobs:
     name: Clippy
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal

(The `on:` and `branches:` -/+ pairs in the first hunk differ only in trailing whitespace.)
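Note: both workflow files above bump actions/checkout from v2 to v4. checkout@v2 runs on an end-of-life Node.js runtime that GitHub Actions has deprecated; v4 targets the currently supported Node 20 runtime and is a drop-in replacement for these jobs.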
3 changes: 3 additions & 0 deletions .gitignore
@@ -40,3 +40,6 @@ candle-wasm-examples/*/package-lock.json
 candle-wasm-examples/**/config*.json
 .DS_Store
 .idea/*
+__pycache__
+out.safetensors
+out.wav
22 changes: 11 additions & 11 deletions Cargo.toml
@@ -20,7 +20,7 @@ exclude = [
 resolver = "2"
 
 [workspace.package]
-version = "0.6.0"
+version = "0.6.1"
 edition = "2021"
 description = "Minimalist ML framework."
 repository = "https://github.com/huggingface/candle"
@@ -33,23 +33,23 @@ ab_glyph = "0.2.23"
 accelerate-src = { version = "0.3.2" }
 anyhow = { version = "1", features = ["backtrace"] }
 byteorder = "1.4.3"
-candle = { path = "./candle-core", package = "candle-core", version = "0.6.0" }
-candle-datasets = { path = "./candle-datasets", version = "0.6.0" }
-candle-flash-attn = { path = "./candle-flash-attn", version = "0.6.0" }
-candle-kernels = { path = "./candle-kernels", version = "0.6.0" }
-candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.6.0" }
-candle-nn = { path = "./candle-nn", version = "0.6.0" }
-candle-onnx = { path = "./candle-onnx", version = "0.6.0" }
-candle-transformers = { path = "./candle-transformers", version = "0.6.0" }
+candle = { path = "./candle-core", package = "candle-core", version = "0.6.1" }
+candle-datasets = { path = "./candle-datasets", version = "0.6.1" }
+candle-flash-attn = { path = "./candle-flash-attn", version = "0.6.1" }
+candle-kernels = { path = "./candle-kernels", version = "0.6.1" }
+candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.6.1" }
+candle-nn = { path = "./candle-nn", version = "0.6.1" }
+candle-onnx = { path = "./candle-onnx", version = "0.6.1" }
+candle-transformers = { path = "./candle-transformers", version = "0.6.1" }
 clap = { version = "4.2.4", features = ["derive"] }
 criterion = { version = "0.5.1", default-features=false }
-cudarc = { version = "=0.11.6", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false }
+cudarc = { version = "0.12.0", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false }
 fancy-regex = "0.13.0"
 gemm = { version = "0.17.0", features = ["wasm-simd128-enable"] }
 hf-hub = "0.3.0"
 half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
 hound = "3.5.1"
-image = { version = "0.25.0", default-features = false, features = ["jpeg", "png"] }
+image = { version = "0.25.2", default-features = false, features = ["jpeg", "png"] }
 imageproc = { version = "0.24.0", default-features = false }
 intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] }
 libc = { version = "0.2.147" }
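Two requirement syntaxes are worth distinguishing in the cudarc line: `version = "=0.11.6"` is an exact pin (Cargo resolves only 0.11.6), while the new `version = "0.12.0"` is Cargo's default caret requirement, accepting any semver-compatible 0.12.x release. Dropping the pin therefore also lets consumers pick up future 0.12 patch releases automatically.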
11 changes: 8 additions & 3 deletions README.md
@@ -63,7 +63,9 @@ We also provide some command line based examples using state of the art models
 - [LLaMA v1, v2, and v3](./candle-examples/examples/llama/): general LLM, includes
   the SOLAR-10.7B variant.
 - [Falcon](./candle-examples/examples/falcon/): general LLM.
-- [Gemma](./candle-examples/examples/gemma/): 2b and 7b general LLMs from Google Deepmind.
+- [Codegeex4](./candle-examples/examples/codegeex4-9b/): code completion, code interpreter, web search, function calling, repository-level tasks.
+- [GLM4](./candle-examples/examples/glm4/): open multilingual multimodal chat LMs by THUDM.
+- [Gemma v1 and v2](./candle-examples/examples/gemma/): 2b and 7b+/9b general LLMs from Google Deepmind.
 - [RecurrentGemma](./candle-examples/examples/recurrent-gemma/): 2b and 7b
   Griffin based models from Google that mix attention with a RNN like state.
 - [Phi-1, Phi-1.5, Phi-2, and Phi-3](./candle-examples/examples/phi/): 1.3b,
@@ -118,6 +120,8 @@ We also provide some command line based examples using state of the art models
   model using residual vector quantization.
 - [MetaVoice](./candle-examples/examples/metavoice/): foundational model for
   text-to-speech.
+- [Parler-TTS](./candle-examples/examples/parler-tts/): large text-to-speech
+  model.
 - [T5](./candle-examples/examples/t5), [Bert](./candle-examples/examples/bert/),
   [JinaBert](./candle-examples/examples/jina-bert/) : useful for sentence embeddings.
 - [DINOv2](./candle-examples/examples/dinov2/): computer vision model trained
@@ -206,7 +210,7 @@ If you have an addition to this list, please submit a pull request.
 - StarCoder, StarCoder2.
 - Phi 1, 1.5, 2, and 3.
 - Mamba, Minimal Mamba
-- Gemma 2b and 7b.
+- Gemma v1 2b and 7b+, v2 2b and 9b.
 - Mistral 7b v0.1.
 - Mixtral 8x7b v0.1.
 - StableLM-3B-4E1T, StableLM-2-1.6B, Stable-Code-3B.
@@ -234,9 +238,10 @@ If you have an addition to this list, please submit a pull request.
 - Whisper, multi-lingual speech-to-text.
 - EnCodec, audio compression model.
 - MetaVoice-1B, text-to-speech model.
+- Parler-TTS, text-to-speech model.
 - Computer Vision Models.
   - DINOv2, ConvMixer, EfficientNet, ResNet, ViT, VGG, RepVGG, ConvNeXT,
-    ConvNeXTv2, MobileOne, EfficientVit (MSRA), MobileNetv4.
+    ConvNeXTv2, MobileOne, EfficientVit (MSRA), MobileNetv4, Hiera, FastViT.
   - yolo-v3, yolo-v8.
   - Segment-Anything Model (SAM).
   - SegFormer.
2 changes: 1 addition & 1 deletion candle-core/benches/benchmarks/affine.rs
@@ -12,7 +12,7 @@ fn run_affine_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name:
     let m = 1024;
     let k = 1024;
 
-    let tensor = Tensor::zeros((b, m, k), dtype, &device).unwrap();
+    let tensor = Tensor::zeros((b, m, k), dtype, device).unwrap();
 
     let flops = b * m * k * dtype.size_in_bytes();
 
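The `&device` → `device` fix here, and the matching edits in the qmatmul, unary, and where_cond benchmarks below, silence clippy's `needless_borrow` lint: these helpers already receive `device: &Device`, so `&device` creates a `&&Device` that the compiler immediately auto-dereferences. A minimal sketch of the pattern, using a stand-in type rather than candle's actual `Device`:

    struct Device;

    // Stands in for an API like `Tensor::zeros(..., device: &Device)`.
    fn takes_ref(_d: &Device) {}

    fn caller(device: &Device) {
        takes_ref(&device); // needless_borrow: `&device` is `&&Device`
        takes_ref(device); // preferred: pass the reference through
    }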
4 changes: 2 additions & 2 deletions candle-core/benches/benchmarks/qmatmul.rs
@@ -7,7 +7,7 @@ use criterion::{black_box, criterion_group, Criterion, Throughput};
 use std::time::Instant;
 
 fn run(matmul: &QMatMul, x: &Tensor) {
-    matmul.forward(&x).unwrap();
+    matmul.forward(x).unwrap();
 }
 
 fn run_bench(c: &mut Criterion, device: &Device, dtype: GgmlDType) {
@@ -50,7 +50,7 @@ fn run_bench(c: &mut Criterion, device: &Device, dtype: GgmlDType) {
 fn criterion_benchmark(c: &mut Criterion) {
     let handler = BenchDeviceHandler::new().unwrap();
     for device in handler.devices {
-        for dtype in vec![
+        for dtype in [
             GgmlDType::F32,
             GgmlDType::F16,
             GgmlDType::Q4_0,
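The `vec![...]` → `[...]` change is a related cleanup (clippy's `useless_vec`): arrays implement `IntoIterator` by value under the 2021 edition, so the loop can iterate the `GgmlDType` list directly with no heap allocation. A tiny illustration of the two forms:

    fn sum_values() -> u32 {
        // Iterates the array by value; no allocation.
        let mut total = 0;
        for x in [1u32, 2, 3] {
            total += x;
        }
        // `for x in vec![1u32, 2, 3] { ... }` computes the same total
        // but allocates first, which clippy flags as useless_vec.
        total
    }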
2 changes: 1 addition & 1 deletion candle-core/benches/benchmarks/unary.rs
@@ -12,7 +12,7 @@ fn run_unary_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &
     let m = 1024;
     let k = 1024;
 
-    let tensor = Tensor::arange(0.0f32, (b * m * k) as f32, &device)
+    let tensor = Tensor::arange(0.0f32, (b * m * k) as f32, device)
         .unwrap()
         .to_dtype(dtype)
         .unwrap()
6 changes: 3 additions & 3 deletions candle-core/benches/benchmarks/where_cond.rs
@@ -25,9 +25,9 @@ const SIZE: usize = B * M * K;
 const DATA: [u8; SIZE] = create_cond_arr::<SIZE>();
 
 fn run_where_cond_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
-    let tensor = Tensor::from_slice(DATA.as_slice(), (B, M, K), &device).unwrap();
-    let on_true = Tensor::ones((B, M, K), dtype, &device).unwrap();
-    let on_false = Tensor::zeros((B, M, K), dtype, &device).unwrap();
+    let tensor = Tensor::from_slice(DATA.as_slice(), (B, M, K), device).unwrap();
+    let on_true = Tensor::ones((B, M, K), dtype, device).unwrap();
+    let on_false = Tensor::zeros((B, M, K), dtype, device).unwrap();
 
     let elements = B * M * K;
     // E.g. 2 f32 tensors + 1 u8 tensor
9 changes: 7 additions & 2 deletions candle-core/src/backprop.rs
@@ -623,9 +623,9 @@ impl Tensor {
             }
             Op::Unary(arg, UnaryOp::Silu) => {
                 let sum_grad = grads.or_insert(arg)?;
-                // d/dx silu = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
+                // d/dx silu = sigmoid(x) * (1 + x * (1 - sigmoid(x))) = sigmoid(x) * (1 - node) + node
                 let sigmoid_arg = (arg.neg()?.exp()? + 1.)?.recip()?;
-                let silu_grad = (&sigmoid_arg * (1. + (arg * (1. - &sigmoid_arg)?)?)?)?;
+                let silu_grad = &sigmoid_arg * (1. - *node) + *node;
                 *sum_grad = sum_grad.add(&(&grad * silu_grad)?)?
             }
             Op::Elu(arg, alpha) => {
@@ -756,4 +756,9 @@ impl GradStore {
         };
         Ok(grad)
     }
+
+    /// Get the tensor ids of the stored gradient tensors
+    pub fn get_ids(&self) -> impl Iterator<Item = &TensorId> {
+        self.0.keys()
+    }
 }
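For readers checking the rewritten gradient: writing σ for the sigmoid, the cached forward output is node = silu(x) = x·σ(x), and the two forms in the updated comment agree:

    \frac{d}{dx}\,\mathrm{silu}(x) = \sigma(x) + x\,\sigma(x)\bigl(1 - \sigma(x)\bigr) = \sigma(x)\bigl(1 + x\,(1 - \sigma(x))\bigr)

    \sigma(x)\,(1 - \mathrm{node}) + \mathrm{node} = \sigma(x) - x\,\sigma(x)^2 + x\,\sigma(x) = \sigma(x)\bigl(1 + x\,(1 - \sigma(x))\bigr)

The new `silu_grad` therefore reuses the already-computed forward value `node` instead of rebuilding `x * (1 - sigmoid(x))` from `arg`, replacing one elementwise expression with a cheaper one.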
4 changes: 2 additions & 2 deletions candle-core/src/cuda_backend/cudnn.rs
@@ -1,6 +1,6 @@
 use crate::WithDType;
 use cudarc;
-use cudarc::cudnn::safe::{Conv2dForward, Cudnn};
+use cudarc::cudnn::safe::{ConvForward, Cudnn};
 use cudarc::driver::{CudaSlice, CudaView, DeviceRepr, ValidAsZeroBits};
 use std::cell::RefCell;
 use std::collections::HashMap;
@@ -87,7 +87,7 @@ pub(crate) fn launch_conv2d<
         cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
         [params.b_size as i32, params.c_out as i32, h_out, w_out],
     )?;
-    let conv2d = Conv2dForward {
+    let conv2d = ConvForward {
         conv: &conv,
         x: &x,
         w: &w,
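The `Conv2dForward` → `ConvForward` rename tracks the cudarc bump from `=0.11.6` to `0.12.0` in Cargo.toml above: `cudarc::cudnn::safe` now exposes the forward-convolution builder as `ConvForward`, so this appears to be an API-tracking change rather than a behavioral one.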
2 changes: 2 additions & 0 deletions candle-core/src/cuda_backend/mod.rs
@@ -174,6 +174,7 @@ impl Map1 for Im2Col1D {
     }
 }
 
+#[allow(unused)]
 struct Im2Col {
     h_k: usize,
     w_k: usize,
@@ -183,6 +184,7 @@ struct Im2Col {
 }
 
 impl Im2Col {
+    #[allow(unused)]
     fn hw_out(&self, h: usize, w: usize) -> (usize, usize) {
         let h_out = (h + 2 * self.padding - self.dilation * (self.h_k - 1) - 1) / self.stride + 1;
         let w_out = (w + 2 * self.padding - self.dilation * (self.w_k - 1) - 1) / self.stride + 1;
16 changes: 16 additions & 0 deletions candle-core/src/device.rs
@@ -171,6 +171,22 @@ impl Device {
         matches!(self, Self::Metal(_))
     }
 
+    pub fn supports_bf16(&self) -> bool {
+        match self {
+            Self::Cuda(_) => true,
+            Self::Metal(_) | Self::Cpu => false,
+        }
+    }
+
+    /// Return `BF16` for devices that support it, otherwise default to `F32`.
+    pub fn bf16_default_to_f32(&self) -> DType {
+        if self.supports_bf16() {
+            DType::BF16
+        } else {
+            DType::F32
+        }
+    }
+
     pub fn cuda_if_available(ordinal: usize) -> Result<Self> {
         if crate::utils::cuda_is_available() {
             Self::new_cuda(ordinal)
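A minimal sketch of how the new helpers compose with the existing `cuda_if_available` constructor shown above (the wrapper function is illustrative, not part of this diff):

    use candle_core::{DType, Device, Result};

    fn pick_device_and_dtype() -> Result<(Device, DType)> {
        // Falls back to the CPU device when no CUDA device is present.
        let device = Device::cuda_if_available(0)?;
        // BF16 on CUDA, F32 on CPU/Metal, per `supports_bf16` above.
        let dtype = device.bf16_default_to_f32();
        assert!(matches!(dtype, DType::BF16 | DType::F32));
        Ok((device, dtype))
    }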
(Diff truncated: the remaining changed files did not load.)
