Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions common/train.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include "ggml.h"
#include "llama.h"

#define LLAMA_TRAIN_MAX_NODES 16384

typedef std::string mt19937_state;

struct train_state {
Expand Down
21 changes: 12 additions & 9 deletions examples/benchmark/benchmark-matmult.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ int main(int argc, char ** argv) {
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);

// printf("Creating compute graph\n");
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
struct ggml_cgraph * gf = ggml_new_graph(ctx);
ggml_build_forward_expand(gf, m11xm2);

printf("n_threads=%i\n", benchmark_params.n_threads);

Expand All @@ -180,9 +181,9 @@ int main(int argc, char ** argv) {

std::vector<uint8_t> work_buffer;

ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);

TENSOR_DUMP(gf.nodes[0]);
TENSOR_DUMP(gf->nodes[0]);

printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));

Expand All @@ -200,7 +201,8 @@ int main(int argc, char ** argv) {
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);

// printf("Creating compute graph\n");
struct ggml_cgraph gf31 = ggml_build_forward(q31);
struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
ggml_build_forward_expand(gf31, q31);

// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
Expand All @@ -211,7 +213,8 @@ int main(int argc, char ** argv) {
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);

//printf("Creating compute graph\n");
struct ggml_cgraph gf32 = ggml_build_forward(q32);
struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
ggml_build_forward_expand(gf32, q32);
printf("n_threads=%i\n", benchmark_params.n_threads);

const int dimx = sizex;
Expand All @@ -223,7 +226,7 @@ int main(int argc, char ** argv) {


// Let's use the F32 result from above as a reference for the quantized multiplication
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);

printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
printf("=====================================================================================\n");
Expand All @@ -233,7 +236,7 @@ int main(int argc, char ** argv) {

long long int start = ggml_time_us();
//printf("Running ggml_graph_compute\n");
ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);

long long int stop = ggml_time_us();
long long int usec = stop-start;
Expand All @@ -251,7 +254,7 @@ int main(int argc, char ** argv) {

// Check that the matrix multiplication result is in the right ballpark
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6

Expand All @@ -266,7 +269,7 @@ int main(int argc, char ** argv) {
}

// Running a different graph computation to make sure we override the CPU cache lines
ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
}
printf("\n");
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
Expand Down
4 changes: 2 additions & 2 deletions examples/export-lora/export-lora.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ static struct lora_data * load_lora(struct lora_info * info) {
}

struct ggml_init_params params_ggml;
params_ggml.mem_size = ggml_tensor_overhead() * GGML_MAX_NODES;
params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
params_ggml.mem_buffer = NULL;
params_ggml.no_alloc = true;
result->ctx = ggml_init(params_ggml);
Expand Down Expand Up @@ -334,7 +334,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;

struct ggml_init_params params;
params.mem_size = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
params.mem_buffer = NULL;
params.no_alloc = true;
struct ggml_context * ctx = NULL;
Expand Down
4 changes: 2 additions & 2 deletions examples/finetune/finetune.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1742,8 +1742,8 @@ int main(int argc, char ** argv) {

// context for compute tensors without their data
size_t estimated_compute_size_wo_data = (
ggml_tensor_overhead()*GGML_MAX_NODES*2
+ (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*(
ggml_tensor_overhead()*LLAMA_TRAIN_MAX_NODES*2
+ (GGML_OBJECT_SIZE+ggml_graph_overhead())*(
params.common.use_checkpointing ? 3 : 2
)
);
Expand Down
2 changes: 1 addition & 1 deletion examples/llava/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
// measure mem requirement and allocate
{
static const size_t tensor_alignment = 32;
new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
new_clip->alloc = ggml_allocr_new_measure(tensor_alignment);
clip_image_f32_batch batch;
batch.size = 1;
Expand Down
10 changes: 5 additions & 5 deletions examples/metal/metal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ int main(int argc, char ** argv) {
struct ggml_context * ctx_data = NULL;
struct ggml_context * ctx_eval = NULL;

struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
struct ggml_cgraph * gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);

// this allocates all Metal resources and memory buffers
auto * ctx_metal = ggml_metal_init(1);
Expand All @@ -46,21 +46,21 @@ int main(int argc, char ** argv) {

// main
{
struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
*(int32_t *) input->data = 1; // BOS

ggml_metal_set_tensor(ctx_metal, input);

// warmup
ggml_metal_graph_compute(ctx_metal, &gf);
ggml_metal_graph_compute(ctx_metal, gf);

const int n_iter = 16;

const int64_t t0 = ggml_time_us();

// the actual inference happens here
for (int i = 0; i < n_iter; ++i) {
ggml_metal_graph_compute(ctx_metal, &gf);
ggml_metal_graph_compute(ctx_metal, gf);
}

const int64_t t1 = ggml_time_us();
Expand All @@ -70,7 +70,7 @@ int main(int argc, char ** argv) {

// debug output
{
struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
struct ggml_tensor * logits = gf->nodes[gf->n_nodes - 1];
ggml_metal_get_tensor(ctx_metal, logits);

float * ptr = (float *) ggml_get_data(logits);
Expand Down
4 changes: 2 additions & 2 deletions examples/train-text-from-scratch/train-text-from-scratch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1109,8 +1109,8 @@ int main(int argc, char ** argv) {

// context for compute tensors without their data
size_t estimated_compute_size_wo_data = (
ggml_tensor_overhead()*GGML_MAX_NODES*2
+ (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*(
ggml_tensor_overhead()*LLAMA_TRAIN_MAX_NODES*2
+ (GGML_OBJECT_SIZE+ggml_graph_overhead())*(
params.common.use_checkpointing ? 3 : 2
)
);
Expand Down
Loading