diff --git a/common/arg.cpp b/common/arg.cpp
index 3c169b5b5f4..aa3090d4f1d 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1,5 +1,6 @@
 #include "arg.h"
 
+#include "common.h"
 #include "log.h"
 #include "sampling.h"
 #include "chat.h"
@@ -322,6 +323,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.kv_overrides.back().key[0] = 0;
     }
 
+    if (!params.tensor_buft_overrides.empty()) {
+        params.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
+
     if (params.reranking && params.embedding) {
         throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
     }
@@ -1615,6 +1620,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
+        "override tensor buffer type", [](common_params & params, const std::string & value) {
+            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+            if (buft_list.empty()) {
+                // enumerate all the devices and add their buffer types to the list
+                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                    auto * dev = ggml_backend_dev_get(i);
+                    auto * buft = ggml_backend_dev_buffer_type(dev);
+                    if (buft) {
+                        buft_list[ggml_backend_buft_name(buft)] = buft;
+                    }
+                }
+            }
+
+            for (const auto & override : string_split<std::string>(value, ',')) {
+                std::string::size_type pos = override.find('=');
+                if (pos == std::string::npos) {
+                    throw std::invalid_argument("invalid value");
+                }
+                std::string tensor_name = override.substr(0, pos);
+                std::string buffer_type = override.substr(pos + 1);
+
+                if (buft_list.find(buffer_type) == buft_list.end()) {
+                    printf("Available buffer types:\n");
+                    for (const auto & it : buft_list) {
+                        printf("  %s\n", ggml_backend_buft_name(it.second));
+                    }
+                    throw std::invalid_argument("unknown buffer type");
+                }
+                // FIXME: this leaks memory
+                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+            }
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
diff --git a/common/common.cpp b/common/common.cpp
index d2b0d50e3ee..7a2da4e7195 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1082,15 +1082,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (!params.devices.empty()) {
         mparams.devices = params.devices.data();
     }
+
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+
     mparams.main_gpu        = params.main_gpu;
     mparams.split_mode      = params.split_mode;
     mparams.tensor_split    = params.tensor_split;
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
+
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -1098,6 +1101,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.kv_overrides = params.kv_overrides.data();
     }
 
+    if (params.tensor_buft_overrides.empty()) {
+        mparams.tensor_buft_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
+        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
+    }
+
     return mparams;
 }
 
diff --git a/common/common.h b/common/common.h
index efe8e7f7965..83602b7ca1f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -272,6 +272,7 @@ struct common_params {
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 8b7c75d85a6..3db21d178ac 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -5109,4 +5109,4 @@ def main() -> None:
 
 
 if __name__ == '__main__':
-    main()
+    main()
\ No newline at end of file
diff --git a/examples/server/lock-free.hpp b/examples/server/lock-free.hpp
new file mode 100644
index 00000000000..c0b872a0944
--- /dev/null
+++ b/examples/server/lock-free.hpp
@@ -0,0 +1,881 @@
+/*
+  hash_map -- Lock-Free Hash Map port from folly::AtomicUnorderedInsertMap for C++.
+
+  Copyright (c) 2010-2017 <http://ez8.co> <orca.zhang@yahoo.com>
+
+  This library is released under the MIT License.
+  Please see LICENSE file or visit https://github.com/ez8-co/atomic for details.
+ */
+#pragma once
+
+#include <stdexcept>
+#include <memory>
+#include <cstring>
+#include <cassert>
+#include <cstdio>
+
+#ifdef _MSC_VER
+  #include <intrin.h>
+  #define LIKELY(x)                   (x)
+  #define UNLIKELY(x)                 (x)
+#else
+  #define LIKELY(x)                   (__builtin_expect((x), 1))
+  #define UNLIKELY(x)                 (__builtin_expect((x), 0))
+#endif
+
+#if __cplusplus >= 201103L || _MSC_VER >= 1700
+  #include <atomic>
+#else
+namespace std {
+
+  typedef enum memory_order {
+    memory_order_relaxed,
+    memory_order_consume,
+    memory_order_acquire,
+    memory_order_release,
+    memory_order_acq_rel,
+    memory_order_seq_cst
+  } memory_order;
+
+#ifdef _MSC_VER
+  template <typename T, size_t N = sizeof(T)>
+  struct interlocked {};
+
+  template <typename T>
+  struct interlocked<T, 4> {
+    static inline T incre(T volatile* x) {
+      return static_cast<T>(_InterlockedIncrement(reinterpret_cast<volatile long*>(x)));
+    }
+    static inline T decre(T volatile* x) {
+      return static_cast<T>(_InterlockedDecrement(reinterpret_cast<volatile long*>(x)));
+    }
+    static inline T add(T volatile* x, T delta) {
+      return static_cast<T>(_InterlockedExchangeAdd(reinterpret_cast<volatile long*>(x), delta));
+    }
+    static inline T compare_exchange(T volatile* x, const T new_val, const T expected_val) {
+      return static_cast<T>(
+        _InterlockedCompareExchange(reinterpret_cast<volatile long*>(x),
+          static_cast<const long>(new_val), static_cast<const long>(expected_val)));
+    }
+    static inline T exchange(T volatile* x, const T new_val) {
+      return static_cast<T>(
+        _InterlockedExchange(
+          reinterpret_cast<volatile long*>(x), static_cast<const long>(new_val)));
+    }
+  };
+
+  template <typename T>
+  struct interlocked<T, 8> {
+    static inline T incre(T volatile* x) {
+#ifdef WIN64
+      return static_cast<T>(_InterlockedIncrement64(reinterpret_cast<volatile __int64*>(x)));
+#else
+      return add(x, 1);
+#endif  // WIN64
+    }
+    static inline T decre(T volatile* x) {
+#ifdef WIN64
+      return static_cast<T>(_InterlockedDecrement64(reinterpret_cast<volatile __int64*>(x)));
+#else
+      return add(x, -1);
+#endif  // WIN64
+    }
+    static inline T add(T volatile* x, T delta) {
+#ifdef WIN64
+      return static_cast<T>(_InterlockedExchangeAdd64(reinterpret_cast<volatile __int64*>(x), delta));
+#else
+      __int64 old_val, new_val;
+      do {
+        old_val = static_cast<__int64>(*x);
+        new_val = old_val + static_cast<__int64>(delta);
+      } while (_InterlockedCompareExchange64(
+                 reinterpret_cast<volatile __int64*>(x), new_val, old_val) !=
+               old_val);
+      return static_cast<T>(new_val);
+#endif  // WIN64
+    }
+    static inline T compare_exchange(T volatile* x, const T new_val, const T expected_val) {
+      return static_cast<T>(
+        _InterlockedCompareExchange64(reinterpret_cast<volatile __int64*>(x), 
+          static_cast<const __int64>(new_val), static_cast<const __int64>(expected_val)));
+    }
+    static inline T exchange(T volatile* x, const T new_val) {
+#ifdef WIN64
+      return static_cast<T>(
+        _InterlockedExchange64(reinterpret_cast<volatile __int64*>(x),
+          static_cast<const __int64>(new_val)));
+#else
+      __int64 old_val;
+      do {
+        old_val = static_cast<__int64>(*x);
+      } while (_InterlockedCompareExchange64(
+                 reinterpret_cast<volatile __int64*>(x), new_val, old_val) !=
+               old_val);
+      return static_cast<T>(old_val);
+#endif  // WIN64
+    }
+  };
+
+#else
+
+  template<typename>
+  struct hash {};
+
+  template<>
+  struct hash<size_t> {
+    inline size_t operator()(size_t v) const { return v; }
+  };
+
+#endif
+
+  template <typename T>
+  class atomic {
+  public:
+    atomic() : value_(static_cast<T>(0)) {}
+    explicit atomic(const T value) : value_(value) {}
+
+    T operator++() {
+  #ifdef _MSC_VER
+      return interlocked<T>::incre(&value_);
+  #else
+      return __atomic_add_fetch(&value_, 1, __ATOMIC_SEQ_CST);
+  #endif
+    }
+
+    T operator++(int) {
+      T v = load(); ++(*this); return v;
+    }
+
+    T operator--() {
+  #ifdef _MSC_VER
+      return interlocked<T>::decre(&value_);
+  #else
+      return __atomic_sub_fetch(&value_, 1, __ATOMIC_SEQ_CST);
+  #endif
+    }
+
+    T operator+=(T v) {
+  #ifdef _MSC_VER
+      return interlocked<T>::add(&value_, v);
+  #else
+      return __atomic_add_fetch(&value_, v, __ATOMIC_SEQ_CST);
+  #endif
+    }
+
+    bool compare_exchange_strong(T& expected_val, T new_val, memory_order order = memory_order_seq_cst) {
+  #ifdef _MSC_VER
+      return expected_val == interlocked<T>::compare_exchange(&value_, new_val, expected_val);
+  #else
+      return __atomic_compare_exchange_n(&value_, &expected_val, new_val, 0, order, __ATOMIC_SEQ_CST);
+  #endif
+    }
+
+    void store(const T new_val, memory_order order = memory_order_seq_cst) {
+  #ifdef _MSC_VER
+      interlocked<T>::exchange(&value_, new_val);
+  #else
+      __atomic_store_n(&value_, new_val, order);
+  #endif
+    }
+
+    T load(memory_order order = memory_order_seq_cst) const {
+  #ifdef _MSC_VER
+      return interlocked<T>::add(const_cast<volatile T*>(&value_), 0);
+  #else
+      return __atomic_load_n(&value_, order);
+  #endif
+    }
+
+    T operator=(const T new_value) {
+      store(new_value);
+      return new_value;
+    }
+
+    operator T() const {
+      return load();
+    }
+
+  private:
+    volatile T value_;
+  };
+}
+#endif
+
+/*
+* Copyright 2013-present Facebook, Inc.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+namespace lock_free {
+
+  size_t nextPowTwo(size_t v) {
+  #ifdef _MSC_VER
+    unsigned long x = 0;
+    _BitScanForward(&x, v - 1);
+  #else
+    int x = __builtin_clzll(v - 1);
+  #endif
+    return v ? (size_t(1) << (v - 1 ? (((sizeof(unsigned long long) << 3) - 1) ^ x) + 1 : 0)) : 1;
+  }
+
+  template <
+    typename Key,
+    typename Value,
+    typename Hash = std::hash<Key>,
+    typename KeyEqual = std::equal_to<Key>,
+    template <typename> class Atom = std::atomic,
+    typename IndexType = size_t,
+    typename Allocator = std::allocator<char> >
+
+  struct hash_map {
+
+  typedef Key key_type;
+  typedef Value mapped_type;
+  typedef std::pair<Key,Value> value_type;
+  typedef std::size_t size_type;
+  typedef std::ptrdiff_t difference_type;
+  typedef Hash hasher;
+  typedef KeyEqual key_equal;
+  typedef const value_type& const_reference;
+
+  typedef struct ConstIterator : public std::iterator<std::bidirectional_iterator_tag, value_type> {
+    ConstIterator(const hash_map& owner, IndexType slot)
+      : owner_(owner)
+      , slot_(slot)
+    {}
+
+    const value_type& operator*() const {
+      return owner_.slots_[slot_].keyValue();
+    }
+
+    const value_type* operator->() const {
+      return &owner_.slots_[slot_].keyValue();
+    }
+
+    // pre-increment
+    const ConstIterator& operator++() {
+      while (slot_ > 0) {
+        --slot_;
+        if (owner_.slots_[slot_].state() == LINKED) {
+          break;
+        }
+      }
+      return *this;
+    }
+
+    // post-increment
+    ConstIterator operator++(int /* dummy */) {
+      ConstIterator prev = *this;
+      ++*this;
+      return prev;
+    }
+
+    bool operator==(const ConstIterator& rhs) const {
+      return slot_ == rhs.slot_;
+    }
+    bool operator!=(const ConstIterator& rhs) const {
+      return !(*this == rhs);
+    }
+
+  private:
+    const hash_map& owner_;
+    IndexType slot_;
+  } const_iterator;
+
+  friend ConstIterator;
+
+  hash_map(size_t maxSize,
+           float maxLoadFactor = 0.8f,
+           const Allocator& alloc = Allocator())
+    : allocator_(alloc)
+  {
+    size_t capacity = size_t(maxSize / (maxLoadFactor > 1.0f ? 1.0f : maxLoadFactor) + 128);
+    size_t avail = size_t(1) << (8 * sizeof(IndexType) - 2);
+    if (capacity > avail && maxSize < avail) {
+      // we'll do our best
+      capacity = avail;
+    }
+    if (capacity < maxSize || capacity > avail) {
+      throw std::invalid_argument(
+        "hash_map capacity must fit in IndexType with 2 bits "
+        "left over");
+    }
+
+    numSlots_ = capacity;
+    slotMask_ = nextPowTwo(capacity * 4) - 1;
+    mmapRequested_ = sizeof(Slot) * capacity;
+    slots_ = reinterpret_cast<Slot*>(allocator_.allocate(mmapRequested_));
+    memset(slots_, 0, mmapRequested_);
+    // mark the zero-th slot as in-use but not valid, since that happens
+    // to be our nil value
+    slots_[0].stateUpdate(EMPTY, CONSTRUCTING);
+  }
+
+  ~hash_map() {
+    for (size_t i = 1; i < numSlots_; ++i) {
+      slots_[i].~Slot();
+    }
+    allocator_.deallocate(reinterpret_cast<char*>(slots_), mmapRequested_);
+  }
+
+  template <typename Func, typename V>
+  std::pair<const_iterator, bool> findOrConstruct(const Key& key, Func func, const V* value) {
+    IndexType const slot = keyToSlotIdx(key);
+    IndexType prev = slots_[slot].headAndState_.load(std::memory_order_acquire);
+
+    IndexType existing = find(key, slot);
+    if (existing)
+      return std::make_pair(ConstIterator(*this, existing), false);
+
+    IndexType idx = allocateNear(slot);
+    // allocaion failed, return fake element
+    if (!idx)
+      return std::make_pair(ConstIterator(*this, idx), false);
+    new (&slots_[idx].keyValue().first) Key(key);
+    func(static_cast<void*>(&slots_[idx].keyValue().second), value);
+
+    while (true) {
+      slots_[idx].next_ = prev >> 2;
+
+      // we can merge the head update and the CONSTRUCTING -> LINKED update
+      // into a single CAS if slot == idx (which should happen often)
+      IndexType after = idx << 2;
+      if (slot == idx)
+        after += LINKED;
+      else
+        after += (prev & 3);
+
+      if (slots_[slot].headAndState_.compare_exchange_strong(prev, after)) {
+        // success
+        if (idx != slot)
+          slots_[idx].stateUpdate(CONSTRUCTING, LINKED);
+        return std::make_pair(ConstIterator(*this, idx), true);
+      }
+      // compare_exchange_strong updates its first arg on failure, so
+      // there is no need to reread prev
+
+      existing = find(key, slot);
+      if (existing) {
+        // our allocated key and value are no longer needed
+        slots_[idx].keyValue().first.~Key();
+        slots_[idx].keyValue().second.~Value();
+        slots_[idx].stateUpdate(CONSTRUCTING, EMPTY);
+
+        return std::make_pair(ConstIterator(*this, existing), false);
+      }
+    }
+  }
+
+  template <class K, class V>
+  std::pair<const_iterator,bool> insert(const K& key, const V& value) {
+    return findOrConstruct(key, &hash_map::copyCtor<V>, &value);
+  }
+
+  const_iterator find(const Key& key) const {
+    return ConstIterator(*this, find(key, keyToSlotIdx(key)));
+  }
+
+  const_iterator cbegin() const {
+    IndexType slot = numSlots_ - 1;
+    while (slot > 0 && slots_[slot].state() != LINKED) {
+      --slot;
+    }
+    return ConstIterator(*this, slot);
+  }
+
+  const_iterator cend() const {
+    return ConstIterator(*this, 0);
+  }
+
+  // Add by orca.zhang@yahoo.com
+  void clear() {
+    for (size_t i = 1; i < numSlots_; ++i) {
+      slots_[i].~Slot();
+    }
+    memset(slots_, 0, mmapRequested_);
+    slots_[0].stateUpdate(EMPTY, CONSTRUCTING);
+  }
+
+  // Add by orca.zhang@yahoo.com
+  bool erase(const Key& key) const {
+    KeyEqual ke;
+    IndexType slot = keyToSlotIdx(key);
+    IndexType hs = slots_[slot].headAndState_.load(std::memory_order_acquire);
+    IndexType last_slot = 0;
+    for (IndexType idx = hs >> 2; idx != 0; idx = slots_[idx].next_) {
+      if (ke(key, slots_[idx].keyValue().first)) {
+        if (!last_slot)
+          slots_[slot].headAndState_ = (slots_[idx].next_ & (unsigned)-4) | (hs & 3);
+        else
+          slots_[last_slot].next_ = slots_[idx].next_;
+        slots_[idx].~Slot();
+        slots_[idx].stateUpdate(LINKED, EMPTY);
+        return true;
+      }
+      last_slot = idx;
+    }
+    return false;
+  }
+
+  private:
+    enum {
+      kMaxAllocationTries = 1000, // after this we throw
+    };
+
+    typedef IndexType BucketState;
+
+    enum {
+      EMPTY = 0,
+      CONSTRUCTING = 1,
+      LINKED = 2,
+    };
+
+    /// Lock-free insertion is easiest by prepending to collision chains.
+    /// A large chaining hash table takes two cache misses instead of
+    /// one, however.  Our solution is to colocate the bucket storage and
+    /// the head storage, so that even though we are traversing chains we
+    /// are likely to stay within the same cache line.  Just make sure to
+    /// traverse head before looking at any keys.  This strategy gives us
+    /// 32 bit pointers and fast iteration.
+    struct Slot {
+      /// The bottom two bits are the BucketState, the rest is the index
+      /// of the first bucket for the chain whose keys map to this slot.
+      /// When things are going well the head usually links to this slot,
+      /// but that doesn't always have to happen.
+      Atom<IndexType> headAndState_;
+
+      /// The next bucket in the chain
+      IndexType next_;
+
+      /// Key and Value
+      unsigned char raw_[sizeof(value_type)];
+
+      ~Slot() {
+        BucketState s = state();
+        assert(s == EMPTY || s == LINKED);
+        if (s == LINKED) {
+          keyValue().first.~Key();
+          keyValue().second.~Value();
+        }
+      }
+
+      BucketState state() const {
+        return BucketState(headAndState_.load(std::memory_order_acquire) & 3);
+      }
+
+      void stateUpdate(BucketState before, BucketState after) {
+        assert(state() == before);
+        headAndState_ += (after - before);
+      }
+
+      value_type& keyValue() {
+        assert(state() != EMPTY);
+        union {
+          unsigned char* p;
+          value_type* v;
+        } u;
+        u.p = raw_;
+        return *u.v;
+      }
+
+      const value_type& keyValue() const {
+        assert(state() != EMPTY);
+        union {
+          unsigned char* p;
+          value_type* v;
+        } u;
+        u.p = raw_;
+        return *u.v;
+      }
+
+    };
+
+    // We manually manage the slot memory so we can bypass initialization
+    // (by getting a zero-filled mmap chunk) and optionally destruction of
+    // the slots
+
+    size_t mmapRequested_;
+    size_t numSlots_;
+
+    /// tricky, see keyToSlodIdx
+    size_t slotMask_;
+
+    Allocator allocator_;
+    Slot* slots_;
+
+    IndexType keyToSlotIdx(const Key& key) const {
+      size_t h = hasher()(key);
+      h &= slotMask_;
+      while (h >= numSlots_) {
+        h -= numSlots_;
+      }
+      return h;
+    }
+
+    IndexType find(const Key& key, IndexType slot) const {
+      KeyEqual ke;
+      IndexType hs = slots_[slot].headAndState_.load(std::memory_order_acquire);
+      for (slot = hs >> 2; slot != 0; slot = slots_[slot].next_) {
+        if (ke(key, slots_[slot].keyValue().first)) {
+          return slot;
+        }
+      }
+      return 0;
+    }
+
+    /// Allocates a slot and returns its index.  Tries to put it near
+    /// slots_[start].
+    IndexType allocateNear(IndexType start) {
+      for (IndexType tries = 0; tries < kMaxAllocationTries; ++tries) {
+        IndexType slot = allocationAttempt(start, tries);
+        IndexType prev = slots_[slot].headAndState_.load(std::memory_order_acquire);
+        if ((prev & 3) == EMPTY &&
+          slots_[slot].headAndState_.compare_exchange_strong(
+            prev, prev + CONSTRUCTING - EMPTY)) {
+          return slot;
+        }
+      }
+      return 0; // return fake element rather than throw exception to ignore overflow
+      // throw std::bad_alloc();
+    }
+
+    /// Returns the slot we should attempt to allocate after tries failed
+    /// tries, starting from the specified slot.  This is pulled out so we
+    /// can specialize it differently during deterministic testing
+    IndexType allocationAttempt(IndexType start, IndexType tries) const {
+      if (LIKELY(tries < 8 && start + tries < numSlots_)) {
+        return IndexType(start + tries);
+      } else {
+        IndexType rv;
+        if (sizeof(IndexType) <= 4) {
+          rv = IndexType(rand() % numSlots_);
+        } else {
+          rv = IndexType(((int64_t(rand()) << 32) + rand()) % numSlots_);
+        }
+        assert(rv < numSlots_);
+        return rv;
+      }
+    }
+
+    template<typename V>
+    static void copyCtor(void* raw, const V* v) {
+      assert(v);
+      new (raw) Value(*v);
+    }
+  };
+
+  /// MutableAtom is a tiny wrapper than gives you the option of atomically
+  /// updating values inserted into an hash_map<K,
+  /// MutableAtom<V>>.  This relies on hash_map's guarantee
+  /// that it doesn't move values.
+  template <typename T, template <typename> class Atom = std::atomic>
+  struct MutableAtom {
+    mutable Atom<T> data;
+    explicit MutableAtom(const T& init) : data(init) {}
+  };
+
+  /// MutableData is a tiny wrapper than gives you the option of using an
+  /// external concurrency control mechanism to updating values inserted
+  /// into an hash_map.
+  template <typename T>
+  struct MutableData {
+    mutable T data;
+    explicit MutableData(const T& init) : data(init) {}
+  };
+
+  /**
+   * A very simple atomic single-linked list primitive.
+   *
+   * Usage:
+   *
+   * class MyClass {
+   *   _linked_list_hook<MyClass> hook_;
+   * }
+   *
+   * _linked_list<MyClass, &MyClass::hook_> list;
+   * list.insert(&a);
+   * list.sweep([] (MyClass* c) { doSomething(c); }
+   */
+  template <class T>
+  struct _linked_list_hook {
+    T* next{nullptr};
+  };
+
+  template <class T, _linked_list_hook<T> T::*HookMember>
+  class _linked_list {
+  public:
+    _linked_list() {}
+
+    _linked_list(const _linked_list&) = delete;
+    _linked_list& operator=(const _linked_list&) =
+        delete;
+
+    _linked_list(_linked_list&& other) noexcept
+        : head_(other.head_.exchange(nullptr, std::memory_order_acq_rel)) {}
+
+    // Absent because would be too error-prone to use correctly because of
+    // the requirement that lists are empty upon destruction.
+    _linked_list& operator=(
+        _linked_list&& other) noexcept = delete;
+
+    /**
+     * Move the currently held elements to a new list.
+     * The current list becomes empty, but concurrent threads
+     * might still add new elements to it.
+     *
+     * Equivalent to calling a move constructor, but more linter-friendly
+     * in case you still need the old list.
+     */
+    _linked_list spliceAll() { return std::move(*this); }
+
+    /**
+     * Move-assign the current list to `other`, then reverse-sweep
+     * the old list with the provided callback `func`.
+     *
+     * A safe replacement for the move assignment operator, which is absent
+     * because of the resource leak concerns.
+     */
+    template <typename F>
+    void reverseSweepAndAssign(_linked_list&& other, F&& func) {
+      auto otherHead = other.head_.exchange(nullptr, std::memory_order_acq_rel);
+      auto head = head_.exchange(otherHead, std::memory_order_acq_rel);
+      unlinkAll(head, std::forward<F>(func));
+    }
+
+    /**
+     * Note: The list must be empty on destruction.
+     */
+    ~_linked_list() { assert(empty()); }
+
+    /**
+     * Returns the current head of the list.
+     *
+     * WARNING: The returned pointer might not be valid if the list
+     * is modified concurrently!
+     */
+    T* unsafeHead() const { return head_.load(std::memory_order_acquire); }
+
+    /**
+     * Returns true if the list is empty.
+     *
+     * WARNING: This method's return value is only valid for a snapshot
+     * of the state, it might become stale as soon as it's returned.
+     */
+    bool empty() const { return unsafeHead() == nullptr; }
+
+    /**
+     * Atomically insert t at the head of the list.
+     * @return True if the inserted element is the only one in the list
+     *         after the call.
+     */
+    bool insertHead(T* t) {
+      assert(next(t) == nullptr);
+
+      auto oldHead = head_.load(std::memory_order_relaxed);
+      do {
+        next(t) = oldHead;
+        /* oldHead is updated by the call below.
+
+          NOTE: we don't use next(t) instead of oldHead directly due to
+          compiler bugs (GCC prior to 4.8.3 (bug 60272), clang (bug 18899),
+          MSVC (bug 819819); source:
+          http://en.cppreference.com/w/cpp/atomic/atomic/compare_exchange */
+      } while (!head_.compare_exchange_weak(
+          oldHead, t, std::memory_order_release, std::memory_order_relaxed));
+
+      return oldHead == nullptr;
+    }
+
+    /**
+     * Replaces the head with nullptr,
+     * and calls func() on the removed elements in the order from tail to head.
+     * Returns false if the list was empty.
+     */
+    template <typename F>
+    bool sweepOnce(F&& func) {
+      if (auto head = head_.exchange(nullptr, std::memory_order_acq_rel)) {
+        auto rhead = reverse(head);
+        unlinkAll(rhead, std::forward<F>(func));
+        return true;
+      }
+      return false;
+    }
+
+    /**
+     * Repeatedly replaces the head with nullptr,
+     * and calls func() on the removed elements in the order from tail to head.
+     * Stops when the list is empty.
+     */
+    template <typename F>
+    void sweep(F&& func) {
+      while (sweepOnce(func)) {
+      }
+    }
+
+    /**
+     * Similar to sweep() but calls func() on elements in LIFO order.
+     *
+     * func() is called for all elements in the list at the moment
+     * reverseSweep() is called.  Unlike sweep() it does not loop to ensure the
+     * list is empty at some point after the last invocation.  This way callers
+     * can reason about the ordering: elements inserted since the last call to
+     * reverseSweep() will be provided in LIFO order.
+     *
+     * Example: if elements are inserted in the order 1-2-3, the callback is
+     * invoked 3-2-1.  If the callback moves elements onto a stack, popping off
+     * the stack will produce the original insertion order 1-2-3.
+     */
+    template <typename F>
+    void reverseSweep(F&& func) {
+      // We don't loop like sweep() does because the overall order of callbacks
+      // would be strand-wise LIFO which is meaningless to callers.
+      auto head = head_.exchange(nullptr, std::memory_order_acq_rel);
+      unlinkAll(head, std::forward<F>(func));
+    }
+
+  private:
+    std::atomic<T*> head_{nullptr};
+
+    static T*& next(T* t) { return (t->*HookMember).next; }
+
+    /* Reverses a linked list, returning the pointer to the new head
+      (old tail) */
+    static T* reverse(T* head) {
+      T* rhead = nullptr;
+      while (head != nullptr) {
+        auto t = head;
+        head = next(t);
+        next(t) = rhead;
+        rhead = t;
+      }
+      return rhead;
+    }
+
+    /* Unlinks all elements in the linked list fragment pointed to by `head',
+    * calling func() on every element */
+    template <typename F>
+    static void unlinkAll(T* head, F&& func) {
+      while (head != nullptr) {
+        auto t = head;
+        head = next(t);
+        next(t) = nullptr;
+        func(t);
+      }
+    }
+  };
+
+  /**
+   * A very simple atomic single-linked list primitive.
+   *
+   * Usage:
+   *
+   * linked_list<MyClass> list;
+   * list.insert(a);
+   * list.sweep([] (MyClass& c) { doSomething(c); }
+   */
+
+  template <class T>
+  class linked_list {
+  public:
+    linked_list() {}
+    linked_list(const linked_list&) = delete;
+    linked_list& operator=(const linked_list&) = delete;
+    linked_list(linked_list&& other) noexcept = default;
+    linked_list& operator=(linked_list&& other) noexcept {
+      list_.reverseSweepAndAssign(std::move(other.list_), [](Wrapper* node) {
+        delete node;
+      });
+      return *this;
+    }
+
+    ~linked_list() {
+      sweep([](T&&) {});
+    }
+
+    bool empty() const { return list_.empty(); }
+
+    /**
+     * Atomically insert t at the head of the list.
+     * @return True if the inserted element is the only one in the list
+     *         after the call.
+     */
+    bool insertHead(T t) {
+      auto wrapper = std::make_unique<Wrapper>(std::move(t));
+
+      return list_.insertHead(wrapper.release());
+    }
+
+    /**
+     * Repeatedly pops element from head,
+     * and calls func() on the removed elements in the order from tail to head.
+     * Stops when the list is empty.
+     */
+    template <typename F>
+    void sweep(F&& func) {
+      list_.sweep([&](Wrapper* wrapperPtr) mutable {
+        std::unique_ptr<Wrapper> wrapper(wrapperPtr);
+
+        func(std::move(wrapper->data));
+      });
+    }
+
+    /**
+     * Sweeps the list a single time, as a single point in time swap with the
+     * current contents of the list.
+     *
+     * Unlike sweep() it does not loop to ensure the list is empty at some point
+     * after the last invocation.
+     *
+     * Returns false if the list is empty.
+     */
+    template <typename F>
+    bool sweepOnce(F&& func) {
+      return list_.sweepOnce([&](Wrapper* wrappedPtr) {
+        std::unique_ptr<Wrapper> wrapper(wrappedPtr);
+        func(std::move(wrapper->data));
+      });
+    }
+
+    /**
+     * Similar to sweep() but calls func() on elements in LIFO order.
+     *
+     * func() is called for all elements in the list at the moment
+     * reverseSweep() is called.  Unlike sweep() it does not loop to ensure the
+     * list is empty at some point after the last invocation.  This way callers
+     * can reason about the ordering: elements inserted since the last call to
+     * reverseSweep() will be provided in LIFO order.
+     *
+     * Example: if elements are inserted in the order 1-2-3, the callback is
+     * invoked 3-2-1.  If the callback moves elements onto a stack, popping off
+     * the stack will produce the original insertion order 1-2-3.
+     */
+    template <typename F>
+    void reverseSweep(F&& func) {
+      list_.reverseSweep([&](Wrapper* wrapperPtr) mutable {
+        std::unique_ptr<Wrapper> wrapper(wrapperPtr);
+
+        func(std::move(wrapper->data));
+      });
+    }
+
+  private:
+    struct Wrapper {
+      explicit Wrapper(T&& t) : data(std::move(t)) {}
+
+      _linked_list_hook<Wrapper> hook;
+      T data;
+    };
+    _linked_list<Wrapper, &Wrapper::hook> list_;
+  };
+
+} // namespace lock_free
\ No newline at end of file
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2306dc26fe4..2b634697dcc 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -18,6 +18,8 @@
 #include "index.html.gz.hpp"
 #include "loading.html.hpp"
 
+#include "lock-free.hpp"
+
 #include <atomic>
 #include <chrono>
 #include <condition_variable>
@@ -514,7 +516,7 @@ struct server_task_result {
 };
 
 // using shared_ptr for polymorphism of server_task_result
-using server_task_result_ptr = std::unique_ptr<server_task_result>;
+using server_task_result_ptr = std::shared_ptr<server_task_result>;
 
 inline std::string stop_type_to_str(stop_type type) {
     switch (type) {
@@ -1489,12 +1491,15 @@ struct server_metrics {
 };
 
 struct server_queue {
-    int id = 0;
+    std::atomic<int> id = 0;
     bool running;
 
     // queues
-    std::deque<server_task> queue_tasks;
-    std::deque<server_task> queue_tasks_deferred;
+    lock_free::linked_list<server_task> queue_tasks;
+    lock_free::linked_list<server_task> queue_tasks_deferred;
+    std::atomic<int> n_queue_tasks_deferred = 0;
+
+    lock_free::hash_map<int, int> cancel_tasks = {10000};
 
     std::mutex mutex_tasks;
     std::condition_variable condition_tasks;
@@ -1505,17 +1510,13 @@ struct server_queue {
 
     // Add a new task to the end of the queue
     int post(server_task task, bool front = false) {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
         GGML_ASSERT(task.id != -1);
         // if this is cancel task make sure to clean up pending tasks
         if (task.type == SERVER_TASK_TYPE_CANCEL) {
-            cleanup_pending_task(task.id_target);
-        }
-        QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
-        if (front) {
-            queue_tasks.push_front(std::move(task));
+            cancel_tasks.insert(task.id_target, task.id_target);
         } else {
-            queue_tasks.push_back(std::move(task));
+            QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
+            queue_tasks.insertHead(std::move(task));
         }
         condition_tasks.notify_one();
         return task.id;
@@ -1523,20 +1524,16 @@ struct server_queue {
 
     // multi-task version of post()
     int post(std::vector<server_task> & tasks, bool front = false) {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
         for (auto & task : tasks) {
             if (task.id == -1) {
                 task.id = id++;
             }
             // if this is cancel task make sure to clean up pending tasks
             if (task.type == SERVER_TASK_TYPE_CANCEL) {
-                cleanup_pending_task(task.id_target);
-            }
-            QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
-            if (front) {
-                queue_tasks.push_front(std::move(task));
+                cancel_tasks.insert(task.id_target, task.id_target);
             } else {
-                queue_tasks.push_back(std::move(task));
+                QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
+                queue_tasks.insertHead(std::move(task));
             }
         }
         condition_tasks.notify_one();
@@ -1545,15 +1542,14 @@ struct server_queue {
 
     // Add a new task, but defer until one slot is available
     void defer(server_task task) {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
         QUE_DBG("defer task, id = %d\n", task.id);
-        queue_tasks_deferred.push_back(std::move(task));
+        queue_tasks_deferred.insertHead(std::move(task));
+        n_queue_tasks_deferred++;
         condition_tasks.notify_one();
     }
 
     // Get the next id for creating a new task
     int get_new_id() {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
         int new_id = id++;
         return new_id;
     }
@@ -1570,17 +1566,17 @@ struct server_queue {
 
     // Call when the state of one slot is changed, it will move one task from deferred to main queue
     void pop_deferred_task() {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
         if (!queue_tasks_deferred.empty()) {
-            queue_tasks.emplace_back(std::move(queue_tasks_deferred.front()));
-            queue_tasks_deferred.pop_front();
+            queue_tasks_deferred.sweepOnce([&](server_task && task) {
+                queue_tasks.insertHead(std::move(task));
+            });
+            n_queue_tasks_deferred--;
         }
         condition_tasks.notify_one();
     }
 
     // end the start_loop routine
     void terminate() {
-        std::unique_lock<std::mutex> lock(mutex_tasks);
         running = false;
         condition_tasks.notify_all();
     }
@@ -1599,21 +1595,21 @@ struct server_queue {
             QUE_DBG("%s", "processing new tasks\n");
 
             while (true) {
-                std::unique_lock<std::mutex> lock(mutex_tasks);
                 if (!running) {
                     QUE_DBG("%s", "terminate\n");
                     return;
                 }
                 if (queue_tasks.empty()) {
-                    lock.unlock();
                     break;
                 }
-                server_task task = queue_tasks.front();
-                queue_tasks.pop_front();
-                lock.unlock();
-
-                QUE_DBG("processing task, id = %d\n", task.id);
-                callback_new_task(std::move(task));
+                queue_tasks.sweepOnce([&](server_task && task) {
+                    QUE_DBG("processing task, id = %d\n", task.id);
+                    if (cancel_tasks.erase(task.id) > 0) {
+                        QUE_DBG("task id = %d is canceled\n", task.id);
+                        return;
+                    }
+                    callback_new_task(std::move(task));
+                });
             }
 
             // all tasks in the current loop is processed, slots data is now ready
@@ -1622,82 +1618,54 @@ struct server_queue {
             callback_update_slots();
 
             QUE_DBG("%s", "waiting for new tasks\n");
-            {
+            if (!running) {
+                QUE_DBG("%s", "terminate\n");
+                return;
+            }
+            if (queue_tasks.empty()) {
                 std::unique_lock<std::mutex> lock(mutex_tasks);
-                if (!running) {
-                    QUE_DBG("%s", "terminate\n");
-                    return;
-                }
-                if (queue_tasks.empty()) {
-                    condition_tasks.wait(lock, [&]{
-                        return (!queue_tasks.empty() || !running);
-                    });
-                }
+                condition_tasks.wait(lock, [&]{
+                    return (!queue_tasks.empty() || !running);
+                });
             }
         }
     }
-
-private:
-    void cleanup_pending_task(int id_target) {
-        // no need lock because this is called exclusively by post()
-        auto rm_func = [id_target](const server_task & task) {
-            return task.id_target == id_target;
-        };
-        queue_tasks.erase(
-            std::remove_if(queue_tasks.begin(),          queue_tasks.end(),          rm_func),
-            queue_tasks.end());
-        queue_tasks_deferred.erase(
-            std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func),
-            queue_tasks_deferred.end());
-    }
 };
 
 struct server_response {
     // for keeping track of all tasks waiting for the result
-    std::unordered_set<int> waiting_task_ids;
+    lock_free::hash_map<int, int> waiting_task_ids = {10000};
 
     // the main result queue (using ptr for polymorphism)
-    std::vector<server_task_result_ptr> queue_results;
+    lock_free::hash_map<int, server_task_result_ptr> queue_results = {10000};
 
     std::mutex mutex_results;
     std::condition_variable condition_results;
 
     // add the id_task to the list of tasks waiting for response
     void add_waiting_task_id(int id_task) {
-        SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size());
-
-        std::unique_lock<std::mutex> lock(mutex_results);
-        waiting_task_ids.insert(id_task);
+        SRV_DBG("add task %d to waiting list. current no waiting = %d (before add)\n", id_task, queue_results.cbegin() == queue_results.cend() ? 0 : 1);
+        waiting_task_ids.insert(id_task, 0);
     }
 
     void add_waiting_tasks(const std::vector<server_task> & tasks) {
-        std::unique_lock<std::mutex> lock(mutex_results);
-
         for (const auto & task : tasks) {
-            SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size());
-            waiting_task_ids.insert(task.id);
+            SRV_DBG("add task %d to waiting list. current no waiting = %d (before add)\n", task.id, queue_results.cbegin() == queue_results.cend() ? 0 : 1);
+            waiting_task_ids.insert(task.id, 0);
         }
     }
 
     // when the request is finished, we can remove task associated with it
     void remove_waiting_task_id(int id_task) {
-        SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
-
-        std::unique_lock<std::mutex> lock(mutex_results);
+        SRV_DBG("remove task %d from waiting list. current no waiting = %d (before remove)\n", id_task, queue_results.cbegin() == queue_results.cend() ? 0 : 1);
         waiting_task_ids.erase(id_task);
         // make sure to clean up all pending results
-        queue_results.erase(
-            std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) {
-                return res->id == id_task;
-            }),
-            queue_results.end());
+        queue_results.erase(id_task);
     }
 
     void remove_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
-        std::unique_lock<std::mutex> lock(mutex_results);
-
         for (const auto & id_task : id_tasks) {
-            SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size());
+            SRV_DBG("remove task %d from waiting list. current no waiting = %d (before remove)\n", id_task, queue_results.cbegin() == queue_results.cend() ? 0 : 1);
             waiting_task_ids.erase(id_task);
         }
     }
@@ -1705,18 +1673,19 @@ struct server_response {
     // This function blocks the thread until there is a response for one of the id_tasks
     server_task_result_ptr recv(const std::unordered_set<int> & id_tasks) {
         while (true) {
-            std::unique_lock<std::mutex> lock(mutex_results);
-            condition_results.wait(lock, [&]{
-                return !queue_results.empty();
-            });
-
-            for (size_t i = 0; i < queue_results.size(); i++) {
-                if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
-                    server_task_result_ptr res = std::move(queue_results[i]);
-                    queue_results.erase(queue_results.begin() + i);
+            for (const auto & id_task : id_tasks) {
+                auto iter = queue_results.find(id_task);
+                if (iter != queue_results.cend()) {
+                    server_task_result_ptr res = iter->second;
+                    queue_results.erase(id_task);
                     return res;
                 }
             }
+
+            std::unique_lock<std::mutex> lock(mutex_results);
+            condition_results.wait(lock, [&]{
+                return queue_results.cbegin() != queue_results.cend();
+            });
         }
 
         // should never reach here
@@ -1726,16 +1695,16 @@ struct server_response {
     // if timeout is reached, nullptr is returned
     server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
         while (true) {
-            std::unique_lock<std::mutex> lock(mutex_results);
-
-            for (int i = 0; i < (int) queue_results.size(); i++) {
-                if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
-                    server_task_result_ptr res = std::move(queue_results[i]);
-                    queue_results.erase(queue_results.begin() + i);
+            for (const auto & id_task : id_tasks) {
+                auto iter = queue_results.find(id_task);
+                if (iter != queue_results.cend()) {
+                    server_task_result_ptr res = iter->second;
+                    queue_results.erase(id_task);
                     return res;
                 }
             }
 
+            std::unique_lock<std::mutex> lock(mutex_results);
             std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
             if (cr_res == std::cv_status::timeout) {
                 return nullptr;
@@ -1744,26 +1713,34 @@ struct server_response {
 
         // should never reach here
     }
-
+ 
     // single-task version of recv()
     server_task_result_ptr recv(int id_task) {
-        std::unordered_set<int> id_tasks = {id_task};
-        return recv(id_tasks);
+        while (true) {
+            auto iter = queue_results.find(id_task);
+            if (iter != queue_results.cend()) {
+                server_task_result_ptr res = iter->second;
+                queue_results.erase(id_task);
+                return res;
+            }
+
+            std::unique_lock<std::mutex> lock(mutex_results);
+            condition_results.wait(lock, [&]{
+                return queue_results.cbegin() != queue_results.cend();
+            });
+        }
     }
 
     // Send a new result to a waiting id_task
     void send(server_task_result_ptr && result) {
         SRV_DBG("sending result for task id = %d\n", result->id);
 
-        std::unique_lock<std::mutex> lock(mutex_results);
-        for (const auto & id_task : waiting_task_ids) {
-            if (result->id == id_task) {
-                SRV_DBG("task id = %d pushed to result queue\n", result->id);
+        if (waiting_task_ids.find(result->id) != waiting_task_ids.cend()) {
+            SRV_DBG("task id = %d pushed to result queue\n", result->id);
 
-                queue_results.emplace_back(std::move(result));
-                condition_results.notify_all();
-                return;
-            }
+            queue_results.insert(result->id, std::move(result));
+            condition_results.notify_all();
+            return;
         }
     }
 };
@@ -2622,7 +2599,7 @@ struct server_context {
                     res->slots_data          = std::move(slots_data);
                     res->n_idle_slots        = n_idle_slots;
                     res->n_processing_slots  = n_processing_slots;
-                    res->n_tasks_deferred    = queue_tasks.queue_tasks_deferred.size();
+                    res->n_tasks_deferred    = queue_tasks.n_queue_tasks_deferred;
                     res->t_start             = metrics.t_start;
 
                     res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
@@ -3841,6 +3818,10 @@ int main(int argc, char ** argv) {
             // TODO: this log can become very long, put it behind a flag or think about a more compact format
             //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
 
+            if (prompt.contains("chat_history")) {
+                return;
+            }
+
             std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
             tasks.reserve(tokenized_prompts.size());
             for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@@ -4026,7 +4007,7 @@ int main(int argc, char ** argv) {
             OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
     };
 
-    const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl, &res_ok](const httplib::Request & req, httplib::Response & res) {
         LOG_DBG("request: %s\n", req.body.c_str());
         if (ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index b1becccb4de..fdf7c8d1b4c 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -50,6 +50,9 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_hs(ggml_backend_cuda_context
         case 128:
             ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<128, ncols2>(ctx, dst);
             break;
+        case 192:
+            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<192, ncols2>(ctx, dst);
+            break;
         case 256:
             ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<256, ncols2>(ctx, dst);
             break;
@@ -160,6 +163,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
 
     FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_F16_CASE(192, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
 #endif // GGML_CUDA_FA_ALL_QUANTS
 
@@ -235,6 +239,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg
 
     FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_F32_CASE(192, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
 #endif // GGML_CUDA_FA_ALL_QUANTS
 
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
index aba539e8dad..353a89589ee 100644
--- a/ggml/src/ggml-cuda/pad.cu
+++ b/ggml/src/ggml-cuda/pad.cu
@@ -25,6 +25,31 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
     }
 }
 
+static __global__ void pad_f16(const half * x, half * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+    // blockIdx.y: idx of ne1
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne00 +
+            blockIdx.z * ne00 * ne01;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        dst[offset_dst] = 0.0f;
+    }
+}
+
 static void pad_f32_cuda(const float * x, float * dst,
     const int ne00, const int ne01, const int ne02, const int ne03,
     const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
@@ -33,17 +58,33 @@ static void pad_f32_cuda(const float * x, float * dst,
     pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
 }
 
+static void pad_f16_cuda(const half * x, half * dst,
+    const int ne00, const int ne01, const int ne02, const int ne03,
+    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2*ne3);
+    pad_f16<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}
+
 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == src0->type);
     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
 
-    pad_f32_cuda(src0_d, dst_d,
-        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+    if (src0->type == GGML_TYPE_F32) {
+        const float * src0_d = (const float *)src0->data;
+        float * dst_d = (float *)dst->data;
+        pad_f32_cuda(src0_d, dst_d,
+            src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+            dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+    } else {
+        const half * src0_d = (const half *)src0->data;
+        half * dst_d = (half *)dst->data;
+        pad_f16_cuda(src0_d, dst_d,
+            src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+            dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+    }
 }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7fc06724ebd..206fd034133 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1155,6 +1155,12 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
 }
 
 size_t ggml_nbytes(const struct ggml_tensor * tensor) {
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        if (tensor->ne[i] <= 0) {
+            return 0;
+        }
+    }
+
     size_t nbytes;
     const size_t blck_size = ggml_blck_size(tensor->type);
     if (blck_size == 1) {
diff --git a/include/llama.h b/include/llama.h
index 479196026b9..c9d823facb4 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -275,10 +275,18 @@ extern "C" {
         };
     };
 
+    struct llama_model_tensor_buft_override {
+        const char * pattern;
+        ggml_backend_buffer_type_t buft;
+    };
+
     struct llama_model_params {
         // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
         ggml_backend_dev_t * devices;
 
+        // NULL-terminated list of buffer types to use for tensors that match a pattern
+        const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index feffdf0de52..b5fbb3a25f0 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -32,7 +32,7 @@ bool llama_kv_cache_init(
 
     cache.recurrent = llama_model_is_recurrent(&model);
     cache.v_trans   = !cache.recurrent && !cparams.flash_attn;
-    cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
+    cache.can_shift = !cache.recurrent; // not supported due to MLA
 
     LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
             __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 05d58ad90eb..0a70cea8f2e 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -445,7 +445,8 @@ llama_model_loader::llama_model_loader(
         std::vector<std::string> & splits,
         bool use_mmap,
         bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p) {
+        const llama_model_kv_override * param_overrides_p,
+        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
     int trace = 0;
     if (getenv("LLAMA_TRACE")) {
         trace = atoi(getenv("LLAMA_TRACE"));
@@ -457,6 +458,8 @@ llama_model_loader::llama_model_loader(
         }
     }
 
+    tensor_buft_overrides = param_tensor_buft_overrides_p;
+
     // Load the main GGUF
     struct ggml_context * ctx = NULL;
     struct gguf_init_params params = {
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index fe35404b268..0f52b011b69 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -77,8 +77,9 @@ struct llama_model_loader {
 
     llama_mmaps mappings;
 
-    std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
-    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
+    std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
+    std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
+    const llama_model_tensor_buft_override * tensor_buft_overrides;
 
     gguf_context_ptr meta;
     std::vector<ggml_context_ptr> contexts;
@@ -95,7 +96,8 @@ struct llama_model_loader {
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
         bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p);
+        const llama_model_kv_override * param_overrides_p,
+        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
 
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 36a0a009c45..338b678e059 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -11,6 +11,7 @@
 #include <cstring>
 #include <functional>
 #include <map>
+#include <regex>
 #include <sstream>
 #include <stdexcept>
 
@@ -1406,7 +1407,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 if (flags & TENSOR_NOT_REQUIRED) {
                     return nullptr;
                 }
-                throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
+                LLAMA_LOG_WARN("missing tensor info mapping for %s -- ignoring\n", tn.str().c_str());
+                return nullptr;
             }
 
             // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
@@ -1421,7 +1423,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             try {
                 info = llm_tensor_info_for(tn_tensor);
             } catch (const std::out_of_range & e) {
-                throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
+                LLAMA_LOG_WARN("missing tensor info mapping for %s -- ignoring\n", tn.str().c_str());
+                return nullptr;
             }
 
             // skip unused tensors
@@ -1468,9 +1471,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
             }
 
-            ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+            ggml_backend_buffer_type_t buft = nullptr;
+
+            // check overrides
+            if (ml.tensor_buft_overrides) {
+                std::string tensor_name = tn.str();
+                for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
+                    std::regex pattern(overrides->pattern);
+                    if (std::regex_search(tensor_name, pattern)) {
+                        LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
+                        buft = overrides->buft;
+                        break;
+                    }
+                }
+            }
+
             if (!buft) {
-                throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+                buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+                if (!buft) {
+                    throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+                }
             }
 
             // avoid using a host buffer when using mmap
@@ -3582,7 +3602,7 @@ size_t llama_model::size() const {
 }
 
 size_t llama_model::max_nodes() const {
-    return std::max<size_t>(8192, tensors_by_name.size()*5);
+    return std::max<size_t>(65536, tensors_by_name.size()*5);
 }
 
 size_t llama_model::n_devices() const {
@@ -3789,6 +3809,7 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.devices                     =*/ nullptr,
+        /*.tensor_buft_overrides       =*/ nullptr,
         /*.n_gpu_layers                =*/ 0,
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index fb7982655a3..ab50c5d179a 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -527,7 +527,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
diff --git a/src/llama.cpp b/src/llama.cpp
index 607f2786159..e65d2cb9d15 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -40,7 +40,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
 
@@ -582,7 +582,7 @@ static struct ggml_tensor * llm_build_kqv(
 
         // split cached v into n_head heads (not transposed)
         struct ggml_tensor * v =
-            ggml_view_3d(ctx, kv.v_l[il],
+        ggml_view_3d(ctx, kv.v_l[il],
                     n_embd_head_v, n_kv, n_head_kv,
                     ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
                     ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
@@ -6505,7 +6505,6 @@ struct llm_build_context {
                     ext_factor, attn_factor_scaled, beta_fast, beta_slow
                 );
                 cb(q_pe, "q_pe", il);
-
                 // shared RoPE key
                 k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
                 k_pe = ggml_rope_ext(