
Commit

add additional hipblas conditions for cublas
YellowRoseCx committed Jun 21, 2023
1 parent e1f9581 commit 222cbbb
Showing 9 changed files with 36 additions and 36 deletions.
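Every hunk below makes the same fix: preprocessor guards that previously checked only GGML_USE_CUBLAS now also accept GGML_USE_HIPBLAS, so the ROCm/hipBLAS build takes the same GPU code paths as the CUDA/cuBLAS build. A minimal, self-contained sketch of that pattern follows; the GGML_CUDA_OR_HIP helper macro and the main() driver are illustrative only and are not part of the commit.

#include <stdio.h>

// Hypothetical helper (not in the commit): name the repeated
// "cuBLAS or hipBLAS" condition once.
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
#define GGML_CUDA_OR_HIP 1
#endif

int main(void) {
#ifdef GGML_CUDA_OR_HIP
    // GPU path: taken for both -DGGML_USE_CUBLAS (CUDA) and
    // -DGGML_USE_HIPBLAS (ROCm) builds.
    printf("GPU backend enabled\n");
#else
    // CPU-only path, matching the warnings printed by examples/common.cpp
    // and examples/server/server.cpp when GPU flags are passed.
    printf("compiled without cuBLAS/hipBLAS; GPU flags are ignored\n");
#endif
    return 0;
}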
8 changes: 4 additions & 4 deletions examples/common.cpp
@@ -304,7 +304,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
params.main_gpu = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
@@ -314,7 +314,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
std::string arg_next = argv[i];

// split string by , and /
@@ -334,7 +334,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--low-vram" || arg == "-lv") {
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
params.low_vram = true;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
@@ -414,7 +414,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
exit(1);
}

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
exit(1);
6 changes: 3 additions & 3 deletions examples/server/server.cpp
@@ -560,7 +560,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
invalid_param = true;
break;
}
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
std::string arg_next = argv[i];

// split string by , and /
@@ -583,7 +583,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
}
else if (arg == "--low-vram" || arg == "-lv")
{
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
params.low_vram = true;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
@@ -594,7 +594,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
invalid_param = true;
break;
}
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
params.main_gpu = std::stoi(argv[i]);
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
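Both parsers above accept --tensor-split values such as "3,1" or "3/1"; the elided code splits the argument on ',' and '/'. A rough standalone sketch of that parsing step, not the exact code from either file, assuming a std::regex-based split:

#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// Rough sketch only: the real parsers also clamp the number of entries to
// LLAMA_MAX_DEVICES, default missing entries to 0, and report bad input.
static std::vector<float> parse_tensor_split(const std::string & arg) {
    const std::regex sep{R"([,/]+)"};
    std::sregex_token_iterator it(arg.begin(), arg.end(), sep, -1), end;
    std::vector<float> split;
    for (; it != end; ++it) {
        split.push_back(std::stof(it->str()));
    }
    return split;
}

int main() {
    for (float f : parse_tensor_split("60,40")) {
        std::printf("%.1f\n", f);  // prints 60.0 then 40.0
    }
    return 0;
}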
10 changes: 5 additions & 5 deletions ggml.c
@@ -161,7 +161,7 @@ inline static void* ggml_aligned_malloc(size_t size) {
#endif
#elif defined(GGML_USE_OPENBLAS)
#include <cblas.h>
-#elif defined(GGML_USE_CUBLAS) | defined(GGML_USE_HIPBLAS)
+#elif defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
@@ -4116,7 +4116,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
ggml_init_cublas();
#elif defined(GGML_USE_CLBLAST)
ggml_cl_init();
@@ -14875,7 +14875,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
GGML_ASSERT(params);

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
if (skip_cpu) {
return;
@@ -16362,7 +16362,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)

size_t cur = 0;

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
@@ -18637,7 +18637,7 @@ int ggml_cpu_has_wasm_simd(void) {
}

int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST)
return 1;
#else
return 0;
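One visible effect of the last ggml.c hunk is that ggml_cpu_has_blas() now reports 1 for hipBLAS builds as well. A trivial probe using only the public ggml.h API (illustrative; build and link against ggml to run it):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // After this commit, a -DGGML_USE_HIPBLAS build answers "yes" here too.
    printf("BLAS-accelerated build: %s\n", ggml_cpu_has_blas() ? "yes" : "no");
    return 0;
}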
2 changes: 1 addition & 1 deletion llama-util.h
@@ -441,7 +441,7 @@ struct llama_buffer {
llama_buffer& operator=(llama_buffer&&) = delete;
};

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
22 changes: 11 additions & 11 deletions llama.cpp
@@ -10,7 +10,7 @@
#include "llama.h"

#include "ggml.h"
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
@@ -175,7 +175,7 @@ struct llama_kv_cache {
ggml_free(ctx);
}

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
ggml_cuda_free_data(k);
ggml_cuda_free_data(v);
#endif // GGML_USE_CUBLAS
@@ -220,7 +220,7 @@ struct llama_model {
ggml_free(ctx);
}

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cuda_free_data(tensors_by_name[i].second);
}
@@ -791,7 +791,7 @@ struct llama_model_loader {
lmlock->grow_to(lock_size);
}
break;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
case GGML_BACKEND_GPU:
case GGML_BACKEND_GPU_SPLIT:
ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
@@ -911,7 +911,7 @@ static bool kv_cache_init(
ggml_set_name(cache.v, "cache_v");

(void) n_gpu_layers;
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
if (n_gpu_layers > n_layer + 1) {
ggml_cuda_assign_buffers_no_scratch(cache.v);
}
@@ -1141,7 +1141,7 @@ static void llama_model_load_internal(
}

(void) main_gpu;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
ggml_cuda_set_main_device(main_gpu);
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
@@ -1252,7 +1252,7 @@ static void llama_model_load_internal(

(void) vram_scratch;
(void) n_batch;
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
if (low_vram) {
fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
ggml_cuda_set_scratch_size(0); // disable scratch
@@ -1265,7 +1265,7 @@
}
}
#endif // GGML_USE_CUBLAS
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST)
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -1305,7 +1305,7 @@ static void llama_model_load_internal(
}

(void) tensor_split;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
{
ggml_cuda_set_tensor_split(tensor_split);
}
@@ -1425,7 +1425,7 @@ static bool llama_eval_internal(
offload_func_t offload_func_kq = llama_nop;
offload_func_t offload_func_v = llama_nop;

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
if (n_gpu_layers > n_layer) {
offload_func_nr = ggml_cuda_assign_buffers;
}
@@ -1440,7 +1440,7 @@ static bool llama_eval_internal(
for (int il = 0; il < n_layer; ++il) {
offload_func_t offload_func = llama_nop;

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
if (il >= i_gpu_start) {
offload_func = ggml_cuda_assign_buffers;
}
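The last two llama.cpp hunks (1425 and 1440) belong to the per-tensor offload selection in llama_eval_internal(): each layer gets an offload callback that stays llama_nop on CPU-only builds and becomes ggml_cuda_assign_buffers once the layer index reaches i_gpu_start. A simplified, self-contained schematic of that pattern; apart from the guard itself, the types and names below are stand-ins, not the real llama.cpp code:

#include <cstdio>

struct tensor { const char * name; };          // stand-in for ggml_tensor
typedef void (*offload_func_t)(tensor * t);

static void nop_offload(tensor *) { /* CPU build: leave the tensor on the host */ }
static void gpu_assign_buffers(tensor * t) {   // stand-in for ggml_cuda_assign_buffers
    std::printf("offloading %s to the GPU backend\n", t->name);
}

int main() {
    const int n_layer = 4, n_gpu_layers = 2;
    const int i_gpu_start = n_layer - n_gpu_layers;
    for (int il = 0; il < n_layer; ++il) {
        offload_func_t offload_func = nop_offload;
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
        if (il >= i_gpu_start) {
            offload_func = gpu_assign_buffers;  // same guard this commit extends
        }
#endif
        tensor t = { "layer tensor" };
        offload_func(&t);
    }
    return 0;
}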
4 changes: 2 additions & 2 deletions llama.h
@@ -2,7 +2,7 @@
#define LLAMA_H

#include "ggml.h"
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#else
@@ -38,7 +38,7 @@
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 1

-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
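For API consumers, the practical effect of these two llama.h hunks is that LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD are now also defined in hipBLAS builds. A small illustrative check that relies only on the macros shown above:

#include <stdio.h>
#include "llama.h"

int main(void) {
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
    // Defined for cuBLAS, hipBLAS, CLBlast, and Metal builds after this commit.
    printf("GPU offload supported, up to %d device(s)\n", LLAMA_MAX_DEVICES);
#else
    printf("no GPU offload in this build\n");
#endif
    return 0;
}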
14 changes: 7 additions & 7 deletions otherarch/ggml_v2.c
@@ -139,7 +139,7 @@ inline static void* ggml_v2_aligned_malloc(size_t size) {
#include <Accelerate/Accelerate.h>
#elif defined(GGML_USE_OPENBLAS)
#include <cblas.h>
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
#include "ggml_v2-cuda.h"
#endif
#if defined(GGML_USE_CLBLAST)
@@ -3894,7 +3894,7 @@ struct ggml_v2_context * ggml_v2_init(struct ggml_v2_init_params params) {
GGML_V2_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
ggml_v2_init_cublas();
#elif defined(GGML_USE_CLBLAST)
if(quants_unshuffled)
@@ -9448,7 +9448,7 @@ static void ggml_v2_compute_forward_mul_mat_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9642,7 +9642,7 @@ static void ggml_v2_compute_forward_mul_mat_f16_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9881,7 +9881,7 @@ static void ggml_v2_compute_forward_mul_mat_q_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -14061,7 +14061,7 @@ void ggml_v2_graph_compute(struct ggml_v2_context * ctx, struct ggml_v2_cgraph *

size_t cur = 0;

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
if (ggml_v2_cuda_can_mul_mat(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
@@ -15559,7 +15559,7 @@ int ggml_v2_cpu_has_wasm_simd(void) {
}

int ggml_v2_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST)
return 1;
#else
return 0;
2 changes: 1 addition & 1 deletion otherarch/llama_v2-util.h
@@ -415,7 +415,7 @@ struct llama_v2_buffer {
llama_v2_buffer& operator=(llama_v2_buffer&&) = delete;
};

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
#include "ggml_v2-cuda.h"
struct llama_v2_ctx_buffer {
uint8_t * addr = NULL;
4 changes: 2 additions & 2 deletions otherarch/llama_v2.cpp
@@ -9,7 +9,7 @@
#include "llama_v2.h"

#include "ggml_v2.h"
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
#include "ggml_v2-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml_v2-opencl.h"
@@ -3088,4 +3088,4 @@ std::vector<llama_token> llama_v2_tokenize(struct llama_v2_context * ctx, const
res.resize(n);

return res;
-}
+}
