diff --git a/common/arg.cpp b/common/arg.cpp index 3c169b5b5f4..aa3090d4f1d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1,5 +1,6 @@ #include "arg.h" +#include "common.h" #include "log.h" #include "sampling.h" #include "chat.h" @@ -322,6 +323,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context params.kv_overrides.back().key[0] = 0; } + if (!params.tensor_buft_overrides.empty()) { + params.tensor_buft_overrides.push_back({nullptr, nullptr}); + } + if (params.reranking && params.embedding) { throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both"); } @@ -1615,6 +1620,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex exit(0); } )); + add_opt(common_arg( + {"--override-tensor", "-ot"}, "=,...", + "override tensor buffer type", [](common_params & params, const std::string & value) { + /* static */ std::map buft_list; + if (buft_list.empty()) { + // enumerate all the devices and add their buffer types to the list + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + auto * buft = ggml_backend_dev_buffer_type(dev); + if (buft) { + buft_list[ggml_backend_buft_name(buft)] = buft; + } + } + } + + for (const auto & override : string_split(value, ',')) { + std::string::size_type pos = override.find('='); + if (pos == std::string::npos) { + throw std::invalid_argument("invalid value"); + } + std::string tensor_name = override.substr(0, pos); + std::string buffer_type = override.substr(pos + 1); + + if (buft_list.find(buffer_type) == buft_list.end()) { + printf("Available buffer types:\n"); + for (const auto & it : buft_list) { + printf(" %s\n", ggml_backend_buft_name(it.second)); + } + throw std::invalid_argument("unknown buffer type"); + } + // FIXME: this leaks memory + params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)}); + } + } + )); add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", diff --git a/common/common.cpp b/common/common.cpp index d2b0d50e3ee..7a2da4e7195 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1082,15 +1082,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { if (!params.devices.empty()) { mparams.devices = params.devices.data(); } + if (params.n_gpu_layers != -1) { mparams.n_gpu_layers = params.n_gpu_layers; } + mparams.main_gpu = params.main_gpu; mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; mparams.check_tensors = params.check_tensors; + if (params.kv_overrides.empty()) { mparams.kv_overrides = NULL; } else { @@ -1098,6 +1101,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { mparams.kv_overrides = params.kv_overrides.data(); } + if (params.tensor_buft_overrides.empty()) { + mparams.tensor_buft_overrides = NULL; + } else { + GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern"); + mparams.tensor_buft_overrides = params.tensor_buft_overrides.data(); + } + return mparams; } diff --git a/common/common.h b/common/common.h index efe8e7f7965..83602b7ca1f 100644 --- a/common/common.h +++ b/common/common.h @@ -272,6 +272,7 @@ struct common_params { std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector kv_overrides; + std::vector tensor_buft_overrides; bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply) std::vector lora_adapters; // lora adapter path with user defined scale diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8b7c75d85a6..3db21d178ac 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5109,4 +5109,4 @@ def main() -> None: if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/examples/server/lock-free.hpp b/examples/server/lock-free.hpp new file mode 100644 index 00000000000..c0b872a0944 --- /dev/null +++ b/examples/server/lock-free.hpp @@ -0,0 +1,881 @@ +/* + hash_map -- Lock-Free Hash Map port from folly::AtomicUnorderedInsertMap for C++. + + Copyright (c) 2010-2017 + + This library is released under the MIT License. + Please see LICENSE file or visit https://github.com/ez8-co/atomic for details. + */ +#pragma once + +#include +#include +#include +#include +#include + +#ifdef _MSC_VER + #include + #define LIKELY(x) (x) + #define UNLIKELY(x) (x) +#else + #define LIKELY(x) (__builtin_expect((x), 1)) + #define UNLIKELY(x) (__builtin_expect((x), 0)) +#endif + +#if __cplusplus >= 201103L || _MSC_VER >= 1700 + #include +#else +namespace std { + + typedef enum memory_order { + memory_order_relaxed, + memory_order_consume, + memory_order_acquire, + memory_order_release, + memory_order_acq_rel, + memory_order_seq_cst + } memory_order; + +#ifdef _MSC_VER + template + struct interlocked {}; + + template + struct interlocked { + static inline T incre(T volatile* x) { + return static_cast(_InterlockedIncrement(reinterpret_cast(x))); + } + static inline T decre(T volatile* x) { + return static_cast(_InterlockedDecrement(reinterpret_cast(x))); + } + static inline T add(T volatile* x, T delta) { + return static_cast(_InterlockedExchangeAdd(reinterpret_cast(x), delta)); + } + static inline T compare_exchange(T volatile* x, const T new_val, const T expected_val) { + return static_cast( + _InterlockedCompareExchange(reinterpret_cast(x), + static_cast(new_val), static_cast(expected_val))); + } + static inline T exchange(T volatile* x, const T new_val) { + return static_cast( + _InterlockedExchange( + reinterpret_cast(x), static_cast(new_val))); + } + }; + + template + struct interlocked { + static inline T incre(T volatile* x) { +#ifdef WIN64 + return static_cast(_InterlockedIncrement64(reinterpret_cast(x))); +#else + return add(x, 1); +#endif // WIN64 + } + static inline T decre(T volatile* x) { +#ifdef WIN64 + return static_cast(_InterlockedDecrement64(reinterpret_cast(x))); +#else + return add(x, -1); +#endif // WIN64 + } + static inline T add(T volatile* x, T delta) { +#ifdef WIN64 + return static_cast(_InterlockedExchangeAdd64(reinterpret_cast(x), delta)); +#else + __int64 old_val, new_val; + do { + old_val = static_cast<__int64>(*x); + new_val = old_val + static_cast<__int64>(delta); + } while (_InterlockedCompareExchange64( + reinterpret_cast(x), new_val, old_val) != + old_val); + return static_cast(new_val); +#endif // WIN64 + } + static inline T compare_exchange(T volatile* x, const T new_val, const T expected_val) { + return static_cast( + _InterlockedCompareExchange64(reinterpret_cast(x), + static_cast(new_val), static_cast(expected_val))); + } + static inline T exchange(T volatile* x, const T new_val) { +#ifdef WIN64 + return static_cast( + _InterlockedExchange64(reinterpret_cast(x), + static_cast(new_val))); +#else + __int64 old_val; + do { + old_val = static_cast<__int64>(*x); + } while (_InterlockedCompareExchange64( + reinterpret_cast(x), new_val, old_val) != + old_val); + return static_cast(old_val); +#endif // WIN64 + } + }; + +#else + + template + struct hash {}; + + template<> + struct hash { + inline size_t operator()(size_t v) const { return v; } + }; + +#endif + + template + class atomic { + public: + atomic() : value_(static_cast(0)) {} + explicit atomic(const T value) : value_(value) {} + + T operator++() { + #ifdef _MSC_VER + return interlocked::incre(&value_); + #else + return __atomic_add_fetch(&value_, 1, __ATOMIC_SEQ_CST); + #endif + } + + T operator++(int) { + T v = load(); ++(*this); return v; + } + + T operator--() { + #ifdef _MSC_VER + return interlocked::decre(&value_); + #else + return __atomic_sub_fetch(&value_, 1, __ATOMIC_SEQ_CST); + #endif + } + + T operator+=(T v) { + #ifdef _MSC_VER + return interlocked::add(&value_, v); + #else + return __atomic_add_fetch(&value_, v, __ATOMIC_SEQ_CST); + #endif + } + + bool compare_exchange_strong(T& expected_val, T new_val, memory_order order = memory_order_seq_cst) { + #ifdef _MSC_VER + return expected_val == interlocked::compare_exchange(&value_, new_val, expected_val); + #else + return __atomic_compare_exchange_n(&value_, &expected_val, new_val, 0, order, __ATOMIC_SEQ_CST); + #endif + } + + void store(const T new_val, memory_order order = memory_order_seq_cst) { + #ifdef _MSC_VER + interlocked::exchange(&value_, new_val); + #else + __atomic_store_n(&value_, new_val, order); + #endif + } + + T load(memory_order order = memory_order_seq_cst) const { + #ifdef _MSC_VER + return interlocked::add(const_cast(&value_), 0); + #else + return __atomic_load_n(&value_, order); + #endif + } + + T operator=(const T new_value) { + store(new_value); + return new_value; + } + + operator T() const { + return load(); + } + + private: + volatile T value_; + }; +} +#endif + +/* +* Copyright 2013-present Facebook, Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +namespace lock_free { + + size_t nextPowTwo(size_t v) { + #ifdef _MSC_VER + unsigned long x = 0; + _BitScanForward(&x, v - 1); + #else + int x = __builtin_clzll(v - 1); + #endif + return v ? (size_t(1) << (v - 1 ? (((sizeof(unsigned long long) << 3) - 1) ^ x) + 1 : 0)) : 1; + } + + template < + typename Key, + typename Value, + typename Hash = std::hash, + typename KeyEqual = std::equal_to, + template class Atom = std::atomic, + typename IndexType = size_t, + typename Allocator = std::allocator > + + struct hash_map { + + typedef Key key_type; + typedef Value mapped_type; + typedef std::pair value_type; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef Hash hasher; + typedef KeyEqual key_equal; + typedef const value_type& const_reference; + + typedef struct ConstIterator : public std::iterator { + ConstIterator(const hash_map& owner, IndexType slot) + : owner_(owner) + , slot_(slot) + {} + + const value_type& operator*() const { + return owner_.slots_[slot_].keyValue(); + } + + const value_type* operator->() const { + return &owner_.slots_[slot_].keyValue(); + } + + // pre-increment + const ConstIterator& operator++() { + while (slot_ > 0) { + --slot_; + if (owner_.slots_[slot_].state() == LINKED) { + break; + } + } + return *this; + } + + // post-increment + ConstIterator operator++(int /* dummy */) { + ConstIterator prev = *this; + ++*this; + return prev; + } + + bool operator==(const ConstIterator& rhs) const { + return slot_ == rhs.slot_; + } + bool operator!=(const ConstIterator& rhs) const { + return !(*this == rhs); + } + + private: + const hash_map& owner_; + IndexType slot_; + } const_iterator; + + friend ConstIterator; + + hash_map(size_t maxSize, + float maxLoadFactor = 0.8f, + const Allocator& alloc = Allocator()) + : allocator_(alloc) + { + size_t capacity = size_t(maxSize / (maxLoadFactor > 1.0f ? 1.0f : maxLoadFactor) + 128); + size_t avail = size_t(1) << (8 * sizeof(IndexType) - 2); + if (capacity > avail && maxSize < avail) { + // we'll do our best + capacity = avail; + } + if (capacity < maxSize || capacity > avail) { + throw std::invalid_argument( + "hash_map capacity must fit in IndexType with 2 bits " + "left over"); + } + + numSlots_ = capacity; + slotMask_ = nextPowTwo(capacity * 4) - 1; + mmapRequested_ = sizeof(Slot) * capacity; + slots_ = reinterpret_cast(allocator_.allocate(mmapRequested_)); + memset(slots_, 0, mmapRequested_); + // mark the zero-th slot as in-use but not valid, since that happens + // to be our nil value + slots_[0].stateUpdate(EMPTY, CONSTRUCTING); + } + + ~hash_map() { + for (size_t i = 1; i < numSlots_; ++i) { + slots_[i].~Slot(); + } + allocator_.deallocate(reinterpret_cast(slots_), mmapRequested_); + } + + template + std::pair findOrConstruct(const Key& key, Func func, const V* value) { + IndexType const slot = keyToSlotIdx(key); + IndexType prev = slots_[slot].headAndState_.load(std::memory_order_acquire); + + IndexType existing = find(key, slot); + if (existing) + return std::make_pair(ConstIterator(*this, existing), false); + + IndexType idx = allocateNear(slot); + // allocaion failed, return fake element + if (!idx) + return std::make_pair(ConstIterator(*this, idx), false); + new (&slots_[idx].keyValue().first) Key(key); + func(static_cast(&slots_[idx].keyValue().second), value); + + while (true) { + slots_[idx].next_ = prev >> 2; + + // we can merge the head update and the CONSTRUCTING -> LINKED update + // into a single CAS if slot == idx (which should happen often) + IndexType after = idx << 2; + if (slot == idx) + after += LINKED; + else + after += (prev & 3); + + if (slots_[slot].headAndState_.compare_exchange_strong(prev, after)) { + // success + if (idx != slot) + slots_[idx].stateUpdate(CONSTRUCTING, LINKED); + return std::make_pair(ConstIterator(*this, idx), true); + } + // compare_exchange_strong updates its first arg on failure, so + // there is no need to reread prev + + existing = find(key, slot); + if (existing) { + // our allocated key and value are no longer needed + slots_[idx].keyValue().first.~Key(); + slots_[idx].keyValue().second.~Value(); + slots_[idx].stateUpdate(CONSTRUCTING, EMPTY); + + return std::make_pair(ConstIterator(*this, existing), false); + } + } + } + + template + std::pair insert(const K& key, const V& value) { + return findOrConstruct(key, &hash_map::copyCtor, &value); + } + + const_iterator find(const Key& key) const { + return ConstIterator(*this, find(key, keyToSlotIdx(key))); + } + + const_iterator cbegin() const { + IndexType slot = numSlots_ - 1; + while (slot > 0 && slots_[slot].state() != LINKED) { + --slot; + } + return ConstIterator(*this, slot); + } + + const_iterator cend() const { + return ConstIterator(*this, 0); + } + + // Add by orca.zhang@yahoo.com + void clear() { + for (size_t i = 1; i < numSlots_; ++i) { + slots_[i].~Slot(); + } + memset(slots_, 0, mmapRequested_); + slots_[0].stateUpdate(EMPTY, CONSTRUCTING); + } + + // Add by orca.zhang@yahoo.com + bool erase(const Key& key) const { + KeyEqual ke; + IndexType slot = keyToSlotIdx(key); + IndexType hs = slots_[slot].headAndState_.load(std::memory_order_acquire); + IndexType last_slot = 0; + for (IndexType idx = hs >> 2; idx != 0; idx = slots_[idx].next_) { + if (ke(key, slots_[idx].keyValue().first)) { + if (!last_slot) + slots_[slot].headAndState_ = (slots_[idx].next_ & (unsigned)-4) | (hs & 3); + else + slots_[last_slot].next_ = slots_[idx].next_; + slots_[idx].~Slot(); + slots_[idx].stateUpdate(LINKED, EMPTY); + return true; + } + last_slot = idx; + } + return false; + } + + private: + enum { + kMaxAllocationTries = 1000, // after this we throw + }; + + typedef IndexType BucketState; + + enum { + EMPTY = 0, + CONSTRUCTING = 1, + LINKED = 2, + }; + + /// Lock-free insertion is easiest by prepending to collision chains. + /// A large chaining hash table takes two cache misses instead of + /// one, however. Our solution is to colocate the bucket storage and + /// the head storage, so that even though we are traversing chains we + /// are likely to stay within the same cache line. Just make sure to + /// traverse head before looking at any keys. This strategy gives us + /// 32 bit pointers and fast iteration. + struct Slot { + /// The bottom two bits are the BucketState, the rest is the index + /// of the first bucket for the chain whose keys map to this slot. + /// When things are going well the head usually links to this slot, + /// but that doesn't always have to happen. + Atom headAndState_; + + /// The next bucket in the chain + IndexType next_; + + /// Key and Value + unsigned char raw_[sizeof(value_type)]; + + ~Slot() { + BucketState s = state(); + assert(s == EMPTY || s == LINKED); + if (s == LINKED) { + keyValue().first.~Key(); + keyValue().second.~Value(); + } + } + + BucketState state() const { + return BucketState(headAndState_.load(std::memory_order_acquire) & 3); + } + + void stateUpdate(BucketState before, BucketState after) { + assert(state() == before); + headAndState_ += (after - before); + } + + value_type& keyValue() { + assert(state() != EMPTY); + union { + unsigned char* p; + value_type* v; + } u; + u.p = raw_; + return *u.v; + } + + const value_type& keyValue() const { + assert(state() != EMPTY); + union { + unsigned char* p; + value_type* v; + } u; + u.p = raw_; + return *u.v; + } + + }; + + // We manually manage the slot memory so we can bypass initialization + // (by getting a zero-filled mmap chunk) and optionally destruction of + // the slots + + size_t mmapRequested_; + size_t numSlots_; + + /// tricky, see keyToSlodIdx + size_t slotMask_; + + Allocator allocator_; + Slot* slots_; + + IndexType keyToSlotIdx(const Key& key) const { + size_t h = hasher()(key); + h &= slotMask_; + while (h >= numSlots_) { + h -= numSlots_; + } + return h; + } + + IndexType find(const Key& key, IndexType slot) const { + KeyEqual ke; + IndexType hs = slots_[slot].headAndState_.load(std::memory_order_acquire); + for (slot = hs >> 2; slot != 0; slot = slots_[slot].next_) { + if (ke(key, slots_[slot].keyValue().first)) { + return slot; + } + } + return 0; + } + + /// Allocates a slot and returns its index. Tries to put it near + /// slots_[start]. + IndexType allocateNear(IndexType start) { + for (IndexType tries = 0; tries < kMaxAllocationTries; ++tries) { + IndexType slot = allocationAttempt(start, tries); + IndexType prev = slots_[slot].headAndState_.load(std::memory_order_acquire); + if ((prev & 3) == EMPTY && + slots_[slot].headAndState_.compare_exchange_strong( + prev, prev + CONSTRUCTING - EMPTY)) { + return slot; + } + } + return 0; // return fake element rather than throw exception to ignore overflow + // throw std::bad_alloc(); + } + + /// Returns the slot we should attempt to allocate after tries failed + /// tries, starting from the specified slot. This is pulled out so we + /// can specialize it differently during deterministic testing + IndexType allocationAttempt(IndexType start, IndexType tries) const { + if (LIKELY(tries < 8 && start + tries < numSlots_)) { + return IndexType(start + tries); + } else { + IndexType rv; + if (sizeof(IndexType) <= 4) { + rv = IndexType(rand() % numSlots_); + } else { + rv = IndexType(((int64_t(rand()) << 32) + rand()) % numSlots_); + } + assert(rv < numSlots_); + return rv; + } + } + + template + static void copyCtor(void* raw, const V* v) { + assert(v); + new (raw) Value(*v); + } + }; + + /// MutableAtom is a tiny wrapper than gives you the option of atomically + /// updating values inserted into an hash_map>. This relies on hash_map's guarantee + /// that it doesn't move values. + template class Atom = std::atomic> + struct MutableAtom { + mutable Atom data; + explicit MutableAtom(const T& init) : data(init) {} + }; + + /// MutableData is a tiny wrapper than gives you the option of using an + /// external concurrency control mechanism to updating values inserted + /// into an hash_map. + template + struct MutableData { + mutable T data; + explicit MutableData(const T& init) : data(init) {} + }; + + /** + * A very simple atomic single-linked list primitive. + * + * Usage: + * + * class MyClass { + * _linked_list_hook hook_; + * } + * + * _linked_list list; + * list.insert(&a); + * list.sweep([] (MyClass* c) { doSomething(c); } + */ + template + struct _linked_list_hook { + T* next{nullptr}; + }; + + template T::*HookMember> + class _linked_list { + public: + _linked_list() {} + + _linked_list(const _linked_list&) = delete; + _linked_list& operator=(const _linked_list&) = + delete; + + _linked_list(_linked_list&& other) noexcept + : head_(other.head_.exchange(nullptr, std::memory_order_acq_rel)) {} + + // Absent because would be too error-prone to use correctly because of + // the requirement that lists are empty upon destruction. + _linked_list& operator=( + _linked_list&& other) noexcept = delete; + + /** + * Move the currently held elements to a new list. + * The current list becomes empty, but concurrent threads + * might still add new elements to it. + * + * Equivalent to calling a move constructor, but more linter-friendly + * in case you still need the old list. + */ + _linked_list spliceAll() { return std::move(*this); } + + /** + * Move-assign the current list to `other`, then reverse-sweep + * the old list with the provided callback `func`. + * + * A safe replacement for the move assignment operator, which is absent + * because of the resource leak concerns. + */ + template + void reverseSweepAndAssign(_linked_list&& other, F&& func) { + auto otherHead = other.head_.exchange(nullptr, std::memory_order_acq_rel); + auto head = head_.exchange(otherHead, std::memory_order_acq_rel); + unlinkAll(head, std::forward(func)); + } + + /** + * Note: The list must be empty on destruction. + */ + ~_linked_list() { assert(empty()); } + + /** + * Returns the current head of the list. + * + * WARNING: The returned pointer might not be valid if the list + * is modified concurrently! + */ + T* unsafeHead() const { return head_.load(std::memory_order_acquire); } + + /** + * Returns true if the list is empty. + * + * WARNING: This method's return value is only valid for a snapshot + * of the state, it might become stale as soon as it's returned. + */ + bool empty() const { return unsafeHead() == nullptr; } + + /** + * Atomically insert t at the head of the list. + * @return True if the inserted element is the only one in the list + * after the call. + */ + bool insertHead(T* t) { + assert(next(t) == nullptr); + + auto oldHead = head_.load(std::memory_order_relaxed); + do { + next(t) = oldHead; + /* oldHead is updated by the call below. + + NOTE: we don't use next(t) instead of oldHead directly due to + compiler bugs (GCC prior to 4.8.3 (bug 60272), clang (bug 18899), + MSVC (bug 819819); source: + http://en.cppreference.com/w/cpp/atomic/atomic/compare_exchange */ + } while (!head_.compare_exchange_weak( + oldHead, t, std::memory_order_release, std::memory_order_relaxed)); + + return oldHead == nullptr; + } + + /** + * Replaces the head with nullptr, + * and calls func() on the removed elements in the order from tail to head. + * Returns false if the list was empty. + */ + template + bool sweepOnce(F&& func) { + if (auto head = head_.exchange(nullptr, std::memory_order_acq_rel)) { + auto rhead = reverse(head); + unlinkAll(rhead, std::forward(func)); + return true; + } + return false; + } + + /** + * Repeatedly replaces the head with nullptr, + * and calls func() on the removed elements in the order from tail to head. + * Stops when the list is empty. + */ + template + void sweep(F&& func) { + while (sweepOnce(func)) { + } + } + + /** + * Similar to sweep() but calls func() on elements in LIFO order. + * + * func() is called for all elements in the list at the moment + * reverseSweep() is called. Unlike sweep() it does not loop to ensure the + * list is empty at some point after the last invocation. This way callers + * can reason about the ordering: elements inserted since the last call to + * reverseSweep() will be provided in LIFO order. + * + * Example: if elements are inserted in the order 1-2-3, the callback is + * invoked 3-2-1. If the callback moves elements onto a stack, popping off + * the stack will produce the original insertion order 1-2-3. + */ + template + void reverseSweep(F&& func) { + // We don't loop like sweep() does because the overall order of callbacks + // would be strand-wise LIFO which is meaningless to callers. + auto head = head_.exchange(nullptr, std::memory_order_acq_rel); + unlinkAll(head, std::forward(func)); + } + + private: + std::atomic head_{nullptr}; + + static T*& next(T* t) { return (t->*HookMember).next; } + + /* Reverses a linked list, returning the pointer to the new head + (old tail) */ + static T* reverse(T* head) { + T* rhead = nullptr; + while (head != nullptr) { + auto t = head; + head = next(t); + next(t) = rhead; + rhead = t; + } + return rhead; + } + + /* Unlinks all elements in the linked list fragment pointed to by `head', + * calling func() on every element */ + template + static void unlinkAll(T* head, F&& func) { + while (head != nullptr) { + auto t = head; + head = next(t); + next(t) = nullptr; + func(t); + } + } + }; + + /** + * A very simple atomic single-linked list primitive. + * + * Usage: + * + * linked_list list; + * list.insert(a); + * list.sweep([] (MyClass& c) { doSomething(c); } + */ + + template + class linked_list { + public: + linked_list() {} + linked_list(const linked_list&) = delete; + linked_list& operator=(const linked_list&) = delete; + linked_list(linked_list&& other) noexcept = default; + linked_list& operator=(linked_list&& other) noexcept { + list_.reverseSweepAndAssign(std::move(other.list_), [](Wrapper* node) { + delete node; + }); + return *this; + } + + ~linked_list() { + sweep([](T&&) {}); + } + + bool empty() const { return list_.empty(); } + + /** + * Atomically insert t at the head of the list. + * @return True if the inserted element is the only one in the list + * after the call. + */ + bool insertHead(T t) { + auto wrapper = std::make_unique(std::move(t)); + + return list_.insertHead(wrapper.release()); + } + + /** + * Repeatedly pops element from head, + * and calls func() on the removed elements in the order from tail to head. + * Stops when the list is empty. + */ + template + void sweep(F&& func) { + list_.sweep([&](Wrapper* wrapperPtr) mutable { + std::unique_ptr wrapper(wrapperPtr); + + func(std::move(wrapper->data)); + }); + } + + /** + * Sweeps the list a single time, as a single point in time swap with the + * current contents of the list. + * + * Unlike sweep() it does not loop to ensure the list is empty at some point + * after the last invocation. + * + * Returns false if the list is empty. + */ + template + bool sweepOnce(F&& func) { + return list_.sweepOnce([&](Wrapper* wrappedPtr) { + std::unique_ptr wrapper(wrappedPtr); + func(std::move(wrapper->data)); + }); + } + + /** + * Similar to sweep() but calls func() on elements in LIFO order. + * + * func() is called for all elements in the list at the moment + * reverseSweep() is called. Unlike sweep() it does not loop to ensure the + * list is empty at some point after the last invocation. This way callers + * can reason about the ordering: elements inserted since the last call to + * reverseSweep() will be provided in LIFO order. + * + * Example: if elements are inserted in the order 1-2-3, the callback is + * invoked 3-2-1. If the callback moves elements onto a stack, popping off + * the stack will produce the original insertion order 1-2-3. + */ + template + void reverseSweep(F&& func) { + list_.reverseSweep([&](Wrapper* wrapperPtr) mutable { + std::unique_ptr wrapper(wrapperPtr); + + func(std::move(wrapper->data)); + }); + } + + private: + struct Wrapper { + explicit Wrapper(T&& t) : data(std::move(t)) {} + + _linked_list_hook hook; + T data; + }; + _linked_list list_; + }; + +} // namespace lock_free \ No newline at end of file diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2306dc26fe4..2b634697dcc 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -18,6 +18,8 @@ #include "index.html.gz.hpp" #include "loading.html.hpp" +#include "lock-free.hpp" + #include #include #include @@ -514,7 +516,7 @@ struct server_task_result { }; // using shared_ptr for polymorphism of server_task_result -using server_task_result_ptr = std::unique_ptr; +using server_task_result_ptr = std::shared_ptr; inline std::string stop_type_to_str(stop_type type) { switch (type) { @@ -1489,12 +1491,15 @@ struct server_metrics { }; struct server_queue { - int id = 0; + std::atomic id = 0; bool running; // queues - std::deque queue_tasks; - std::deque queue_tasks_deferred; + lock_free::linked_list queue_tasks; + lock_free::linked_list queue_tasks_deferred; + std::atomic n_queue_tasks_deferred = 0; + + lock_free::hash_map cancel_tasks = {10000}; std::mutex mutex_tasks; std::condition_variable condition_tasks; @@ -1505,17 +1510,13 @@ struct server_queue { // Add a new task to the end of the queue int post(server_task task, bool front = false) { - std::unique_lock lock(mutex_tasks); GGML_ASSERT(task.id != -1); // if this is cancel task make sure to clean up pending tasks if (task.type == SERVER_TASK_TYPE_CANCEL) { - cleanup_pending_task(task.id_target); - } - QUE_DBG("new task, id = %d, front = %d\n", task.id, front); - if (front) { - queue_tasks.push_front(std::move(task)); + cancel_tasks.insert(task.id_target, task.id_target); } else { - queue_tasks.push_back(std::move(task)); + QUE_DBG("new task, id = %d, front = %d\n", task.id, front); + queue_tasks.insertHead(std::move(task)); } condition_tasks.notify_one(); return task.id; @@ -1523,20 +1524,16 @@ struct server_queue { // multi-task version of post() int post(std::vector & tasks, bool front = false) { - std::unique_lock lock(mutex_tasks); for (auto & task : tasks) { if (task.id == -1) { task.id = id++; } // if this is cancel task make sure to clean up pending tasks if (task.type == SERVER_TASK_TYPE_CANCEL) { - cleanup_pending_task(task.id_target); - } - QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front); - if (front) { - queue_tasks.push_front(std::move(task)); + cancel_tasks.insert(task.id_target, task.id_target); } else { - queue_tasks.push_back(std::move(task)); + QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front); + queue_tasks.insertHead(std::move(task)); } } condition_tasks.notify_one(); @@ -1545,15 +1542,14 @@ struct server_queue { // Add a new task, but defer until one slot is available void defer(server_task task) { - std::unique_lock lock(mutex_tasks); QUE_DBG("defer task, id = %d\n", task.id); - queue_tasks_deferred.push_back(std::move(task)); + queue_tasks_deferred.insertHead(std::move(task)); + n_queue_tasks_deferred++; condition_tasks.notify_one(); } // Get the next id for creating a new task int get_new_id() { - std::unique_lock lock(mutex_tasks); int new_id = id++; return new_id; } @@ -1570,17 +1566,17 @@ struct server_queue { // Call when the state of one slot is changed, it will move one task from deferred to main queue void pop_deferred_task() { - std::unique_lock lock(mutex_tasks); if (!queue_tasks_deferred.empty()) { - queue_tasks.emplace_back(std::move(queue_tasks_deferred.front())); - queue_tasks_deferred.pop_front(); + queue_tasks_deferred.sweepOnce([&](server_task && task) { + queue_tasks.insertHead(std::move(task)); + }); + n_queue_tasks_deferred--; } condition_tasks.notify_one(); } // end the start_loop routine void terminate() { - std::unique_lock lock(mutex_tasks); running = false; condition_tasks.notify_all(); } @@ -1599,21 +1595,21 @@ struct server_queue { QUE_DBG("%s", "processing new tasks\n"); while (true) { - std::unique_lock lock(mutex_tasks); if (!running) { QUE_DBG("%s", "terminate\n"); return; } if (queue_tasks.empty()) { - lock.unlock(); break; } - server_task task = queue_tasks.front(); - queue_tasks.pop_front(); - lock.unlock(); - - QUE_DBG("processing task, id = %d\n", task.id); - callback_new_task(std::move(task)); + queue_tasks.sweepOnce([&](server_task && task) { + QUE_DBG("processing task, id = %d\n", task.id); + if (cancel_tasks.erase(task.id) > 0) { + QUE_DBG("task id = %d is canceled\n", task.id); + return; + } + callback_new_task(std::move(task)); + }); } // all tasks in the current loop is processed, slots data is now ready @@ -1622,82 +1618,54 @@ struct server_queue { callback_update_slots(); QUE_DBG("%s", "waiting for new tasks\n"); - { + if (!running) { + QUE_DBG("%s", "terminate\n"); + return; + } + if (queue_tasks.empty()) { std::unique_lock lock(mutex_tasks); - if (!running) { - QUE_DBG("%s", "terminate\n"); - return; - } - if (queue_tasks.empty()) { - condition_tasks.wait(lock, [&]{ - return (!queue_tasks.empty() || !running); - }); - } + condition_tasks.wait(lock, [&]{ + return (!queue_tasks.empty() || !running); + }); } } } - -private: - void cleanup_pending_task(int id_target) { - // no need lock because this is called exclusively by post() - auto rm_func = [id_target](const server_task & task) { - return task.id_target == id_target; - }; - queue_tasks.erase( - std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func), - queue_tasks.end()); - queue_tasks_deferred.erase( - std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func), - queue_tasks_deferred.end()); - } }; struct server_response { // for keeping track of all tasks waiting for the result - std::unordered_set waiting_task_ids; + lock_free::hash_map waiting_task_ids = {10000}; // the main result queue (using ptr for polymorphism) - std::vector queue_results; + lock_free::hash_map queue_results = {10000}; std::mutex mutex_results; std::condition_variable condition_results; // add the id_task to the list of tasks waiting for response void add_waiting_task_id(int id_task) { - SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size()); - - std::unique_lock lock(mutex_results); - waiting_task_ids.insert(id_task); + SRV_DBG("add task %d to waiting list. current no waiting = %d (before add)\n", id_task, queue_results.cbegin() == queue_results.cend() ? 0 : 1); + waiting_task_ids.insert(id_task, 0); } void add_waiting_tasks(const std::vector & tasks) { - std::unique_lock lock(mutex_results); - for (const auto & task : tasks) { - SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size()); - waiting_task_ids.insert(task.id); + SRV_DBG("add task %d to waiting list. current no waiting = %d (before add)\n", task.id, queue_results.cbegin() == queue_results.cend() ? 0 : 1); + waiting_task_ids.insert(task.id, 0); } } // when the request is finished, we can remove task associated with it void remove_waiting_task_id(int id_task) { - SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); - - std::unique_lock lock(mutex_results); + SRV_DBG("remove task %d from waiting list. current no waiting = %d (before remove)\n", id_task, queue_results.cbegin() == queue_results.cend() ? 0 : 1); waiting_task_ids.erase(id_task); // make sure to clean up all pending results - queue_results.erase( - std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) { - return res->id == id_task; - }), - queue_results.end()); + queue_results.erase(id_task); } void remove_waiting_task_ids(const std::unordered_set & id_tasks) { - std::unique_lock lock(mutex_results); - for (const auto & id_task : id_tasks) { - SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); + SRV_DBG("remove task %d from waiting list. current no waiting = %d (before remove)\n", id_task, queue_results.cbegin() == queue_results.cend() ? 0 : 1); waiting_task_ids.erase(id_task); } } @@ -1705,18 +1673,19 @@ struct server_response { // This function blocks the thread until there is a response for one of the id_tasks server_task_result_ptr recv(const std::unordered_set & id_tasks) { while (true) { - std::unique_lock lock(mutex_results); - condition_results.wait(lock, [&]{ - return !queue_results.empty(); - }); - - for (size_t i = 0; i < queue_results.size(); i++) { - if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { - server_task_result_ptr res = std::move(queue_results[i]); - queue_results.erase(queue_results.begin() + i); + for (const auto & id_task : id_tasks) { + auto iter = queue_results.find(id_task); + if (iter != queue_results.cend()) { + server_task_result_ptr res = iter->second; + queue_results.erase(id_task); return res; } } + + std::unique_lock lock(mutex_results); + condition_results.wait(lock, [&]{ + return queue_results.cbegin() != queue_results.cend(); + }); } // should never reach here @@ -1726,16 +1695,16 @@ struct server_response { // if timeout is reached, nullptr is returned server_task_result_ptr recv_with_timeout(const std::unordered_set & id_tasks, int timeout) { while (true) { - std::unique_lock lock(mutex_results); - - for (int i = 0; i < (int) queue_results.size(); i++) { - if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { - server_task_result_ptr res = std::move(queue_results[i]); - queue_results.erase(queue_results.begin() + i); + for (const auto & id_task : id_tasks) { + auto iter = queue_results.find(id_task); + if (iter != queue_results.cend()) { + server_task_result_ptr res = iter->second; + queue_results.erase(id_task); return res; } } + std::unique_lock lock(mutex_results); std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout)); if (cr_res == std::cv_status::timeout) { return nullptr; @@ -1744,26 +1713,34 @@ struct server_response { // should never reach here } - + // single-task version of recv() server_task_result_ptr recv(int id_task) { - std::unordered_set id_tasks = {id_task}; - return recv(id_tasks); + while (true) { + auto iter = queue_results.find(id_task); + if (iter != queue_results.cend()) { + server_task_result_ptr res = iter->second; + queue_results.erase(id_task); + return res; + } + + std::unique_lock lock(mutex_results); + condition_results.wait(lock, [&]{ + return queue_results.cbegin() != queue_results.cend(); + }); + } } // Send a new result to a waiting id_task void send(server_task_result_ptr && result) { SRV_DBG("sending result for task id = %d\n", result->id); - std::unique_lock lock(mutex_results); - for (const auto & id_task : waiting_task_ids) { - if (result->id == id_task) { - SRV_DBG("task id = %d pushed to result queue\n", result->id); + if (waiting_task_ids.find(result->id) != waiting_task_ids.cend()) { + SRV_DBG("task id = %d pushed to result queue\n", result->id); - queue_results.emplace_back(std::move(result)); - condition_results.notify_all(); - return; - } + queue_results.insert(result->id, std::move(result)); + condition_results.notify_all(); + return; } } }; @@ -2622,7 +2599,7 @@ struct server_context { res->slots_data = std::move(slots_data); res->n_idle_slots = n_idle_slots; res->n_processing_slots = n_processing_slots; - res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); + res->n_tasks_deferred = queue_tasks.n_queue_tasks_deferred; res->t_start = metrics.t_start; res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); @@ -3841,6 +3818,10 @@ int main(int argc, char ** argv) { // TODO: this log can become very long, put it behind a flag or think about a more compact format //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get().c_str() : prompt.dump(2).c_str()); + if (prompt.contains("chat_history")) { + return; + } + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); tasks.reserve(tokenized_prompts.size()); for (size_t i = 0; i < tokenized_prompts.size(); i++) { @@ -4026,7 +4007,7 @@ int main(int argc, char ** argv) { OAICOMPAT_TYPE_NONE); // infill is not OAI compatible }; - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_impl, &res_ok](const httplib::Request & req, httplib::Response & res) { LOG_DBG("request: %s\n", req.body.c_str()); if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index b1becccb4de..fdf7c8d1b4c 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -50,6 +50,9 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_hs(ggml_backend_cuda_context case 128: ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<128, ncols2>(ctx, dst); break; + case 192: + ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<192, ncols2>(ctx, dst); + break; case 256: ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<256, ncols2>(ctx, dst); break; @@ -160,6 +163,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16) FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16) + FATTN_VEC_F16_CASE(192, GGML_TYPE_F16, GGML_TYPE_F16) FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16) #endif // GGML_CUDA_FA_ALL_QUANTS @@ -235,6 +239,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16) FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16) + FATTN_VEC_F32_CASE(192, GGML_TYPE_F16, GGML_TYPE_F16) FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16) #endif // GGML_CUDA_FA_ALL_QUANTS diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu index aba539e8dad..353a89589ee 100644 --- a/ggml/src/ggml-cuda/pad.cu +++ b/ggml/src/ggml-cuda/pad.cu @@ -25,6 +25,31 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons } } +static __global__ void pad_f16(const half * x, half * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { + // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 + // blockIdx.y: idx of ne1 + // blockIDx.x: idx of ne0 / BLOCK_SIZE + int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + + // operation + int offset_dst = + nidx + + blockIdx.y * ne0 + + blockIdx.z * ne0 * gridDim.y; + if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { + int offset_src = + nidx + + blockIdx.y * ne00 + + blockIdx.z * ne00 * ne01; + dst[offset_dst] = x[offset_src]; + } else { + dst[offset_dst] = 0.0f; + } +} + static void pad_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03, const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { @@ -33,17 +58,33 @@ static void pad_f32_cuda(const float * x, float * dst, pad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); } +static void pad_f16_cuda(const half * x, half * dst, + const int ne00, const int ne01, const int ne02, const int ne03, + const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { + int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; + dim3 gridDim(num_blocks, ne1, ne2*ne3); + pad_f16<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); +} + void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; cudaStream_t stream = ctx.stream(); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == src0->type); GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - pad_f32_cuda(src0_d, dst_d, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); + if (src0->type == GGML_TYPE_F32) { + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + pad_f32_cuda(src0_d, dst_d, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); + } else { + const half * src0_d = (const half *)src0->data; + half * dst_d = (half *)dst->data; + pad_f16_cuda(src0_d, dst_d, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); + } } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7fc06724ebd..206fd034133 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1155,6 +1155,12 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) { } size_t ggml_nbytes(const struct ggml_tensor * tensor) { + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + if (tensor->ne[i] <= 0) { + return 0; + } + } + size_t nbytes; const size_t blck_size = ggml_blck_size(tensor->type); if (blck_size == 1) { diff --git a/include/llama.h b/include/llama.h index 479196026b9..c9d823facb4 100644 --- a/include/llama.h +++ b/include/llama.h @@ -275,10 +275,18 @@ extern "C" { }; }; + struct llama_model_tensor_buft_override { + const char * pattern; + ggml_backend_buffer_type_t buft; + }; + struct llama_model_params { // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) ggml_backend_dev_t * devices; + // NULL-terminated list of buffer types to use for tensors that match a pattern + const struct llama_model_tensor_buft_override * tensor_buft_overrides; + int32_t n_gpu_layers; // number of layers to store in VRAM enum llama_split_mode split_mode; // how to split the model across multiple GPUs diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index feffdf0de52..b5fbb3a25f0 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -32,7 +32,7 @@ bool llama_kv_cache_init( cache.recurrent = llama_model_is_recurrent(&model); cache.v_trans = !cache.recurrent && !cparams.flash_attn; - cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + cache.can_shift = !cache.recurrent; // not supported due to MLA LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift); diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 05d58ad90eb..0a70cea8f2e 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -445,7 +445,8 @@ llama_model_loader::llama_model_loader( std::vector & splits, bool use_mmap, bool check_tensors, - const struct llama_model_kv_override * param_overrides_p) { + const llama_model_kv_override * param_overrides_p, + const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); @@ -457,6 +458,8 @@ llama_model_loader::llama_model_loader( } } + tensor_buft_overrides = param_tensor_buft_overrides_p; + // Load the main GGUF struct ggml_context * ctx = NULL; struct gguf_init_params params = { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index fe35404b268..0f52b011b69 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -77,8 +77,9 @@ struct llama_model_loader { llama_mmaps mappings; - std::map weights_map; - std::unordered_map kv_overrides; + std::map weights_map; + std::unordered_map kv_overrides; + const llama_model_tensor_buft_override * tensor_buft_overrides; gguf_context_ptr meta; std::vector contexts; @@ -95,7 +96,8 @@ struct llama_model_loader { std::vector & splits, // optional, only need if the split does not follow naming scheme bool use_mmap, bool check_tensors, - const struct llama_model_kv_override * param_overrides_p); + const llama_model_kv_override * param_overrides_p, + const llama_model_tensor_buft_override * param_tensor_buft_overrides_p); template typename std::enable_if::value, bool>::type diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 36a0a009c45..338b678e059 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -1406,7 +1407,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (flags & TENSOR_NOT_REQUIRED) { return nullptr; } - throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str())); + LLAMA_LOG_WARN("missing tensor info mapping for %s -- ignoring\n", tn.str().c_str()); + return nullptr; } // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops @@ -1421,7 +1423,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { try { info = llm_tensor_info_for(tn_tensor); } catch (const std::out_of_range & e) { - throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str())); + LLAMA_LOG_WARN("missing tensor info mapping for %s -- ignoring\n", tn.str().c_str()); + return nullptr; } // skip unused tensors @@ -1468,9 +1471,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) { GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str()); } - ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list); + ggml_backend_buffer_type_t buft = nullptr; + + // check overrides + if (ml.tensor_buft_overrides) { + std::string tensor_name = tn.str(); + for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { + std::regex pattern(overrides->pattern); + if (std::regex_search(tensor_name, pattern)) { + LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft)); + buft = overrides->buft; + break; + } + } + } + if (!buft) { - throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str())); + buft = select_weight_buft(hparams, t_meta, op, *buft_list); + if (!buft) { + throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str())); + } } // avoid using a host buffer when using mmap @@ -3582,7 +3602,7 @@ size_t llama_model::size() const { } size_t llama_model::max_nodes() const { - return std::max(8192, tensors_by_name.size()*5); + return std::max(65536, tensors_by_name.size()*5); } size_t llama_model::n_devices() const { @@ -3789,6 +3809,7 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const { struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.devices =*/ nullptr, + /*.tensor_buft_overrides =*/ nullptr, /*.n_gpu_layers =*/ 0, /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fb7982655a3..ab50c5d179a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -527,7 +527,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::vector splits = {}; - llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides); + llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index 607f2786159..e65d2cb9d15 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -40,7 +40,7 @@ static int llama_model_load(const std::string & fname, std::vector model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); + llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides); ml.print_info(); @@ -582,7 +582,7 @@ static struct ggml_tensor * llm_build_kqv( // split cached v into n_head heads (not transposed) struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], + ggml_view_3d(ctx, kv.v_l[il], n_embd_head_v, n_kv, n_head_kv, ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), ggml_row_size(kv.v_l[il]->type, n_embd_head_v), @@ -6505,7 +6505,6 @@ struct llm_build_context { ext_factor, attn_factor_scaled, beta_fast, beta_slow ); cb(q_pe, "q_pe", il); - // shared RoPE key k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this k_pe = ggml_rope_ext(