From af5415fae74d3e06e8a94e2bd87088e382c5299a Mon Sep 17 00:00:00 2001
From: Locria Cyber <74560659+iacore@users.noreply.github.com>
Date: Thu, 6 Apr 2023 18:22:44 +0000
Subject: [PATCH] Add loader stub for GGJT

---
 ggml/src/lib.rs        |   6 +-
 llama-rs/src/lib.rs    | 344 +++++------------------------------------
 llama-rs/src/loader.rs | 310 +++++++++++++++++++++++++++++++++++++
 3 files changed, 350 insertions(+), 310 deletions(-)
 create mode 100644 llama-rs/src/loader.rs

diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs
index 9b36bd7d..700a02e1 100644
--- a/ggml/src/lib.rs
+++ b/ggml/src/lib.rs
@@ -16,8 +16,10 @@ use std::{
 
 pub use ggml_sys::ggml_type as Type;
 
-/// Magic constant for `ggml` files (versioned).
-pub const FILE_MAGIC: u32 = 0x67676d66;
+/// Magic constant for `ggml` files (versioned, ggmf).
+pub const FILE_MAGIC_GGMF: u32 = 0x67676d66;
+/// Magic constant for `ggml` files (versioned, ggjt).
+pub const FILE_MAGIC_GGJT: u32 = 0x67676a74;
 
 /// Magic constant for `ggml` files (unversioned).
 pub const FILE_MAGIC_UNVERSIONED: u32 = 0x67676d6c;

diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index 111a1c56..92f0d3a6 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -2,6 +2,8 @@
 //! LLaMA-rs is a Rust port of the llama.cpp project. This allows running inference for Facebook's LLaMA model on a CPU with good performance using full precision, f16 or 4-bit quantized versions of the model.
 
+mod loader;
+
 use core::slice;
 use std::{
     collections::HashMap,
@@ -577,6 +579,7 @@ impl Model {
         n_context_tokens: usize,
         load_progress_callback: impl Fn(LoadProgress),
     ) -> Result<(Model, Vocabulary), LoadError> {
+        use loader::*;
         use std::fs::File;
         use std::io::BufReader;
 
@@ -590,46 +593,11 @@ impl Model {
         let mut reader = BufReader::new(
             File::open(main_path).map_err(|e| LoadError::OpenFileFailed {
                 source: e,
                 path: main_path.to_owned(),
             })?,
         );
 
-        fn read_bytes<const N: usize>(reader: &mut impl BufRead) -> Result<[u8; N], LoadError> {
-            let mut bytes = [0u8; N];
-            reader
-                .read_exact(&mut bytes)
-                .map_err(|e| LoadError::ReadExactFailed {
-                    source: e,
-                    bytes: N,
-                })?;
-            Ok(bytes)
-        }
-
-        fn read_i32(reader: &mut impl BufRead) -> Result<i32, LoadError> {
-            Ok(i32::from_le_bytes(read_bytes::<4>(reader)?))
-        }
-
-        fn read_u32(reader: &mut impl BufRead) -> Result<u32, LoadError> {
-            Ok(u32::from_le_bytes(read_bytes::<4>(reader)?))
-        }
-
-        fn read_f32(reader: &mut impl BufRead) -> Result<f32, LoadError> {
-            Ok(f32::from_le_bytes(read_bytes::<4>(reader)?))
-        }
-
-        /// Helper function. Reads a string from the buffer and returns it.
-        fn read_string(reader: &mut BufReader<File>, len: usize) -> Result<String, LoadError> {
-            let mut buf = vec![0; len];
-            reader
-                .read_exact(&mut buf)
-                .map_err(|e| LoadError::ReadExactFailed {
-                    source: e,
-                    bytes: buf.len(),
-                })?;
-            let s = String::from_utf8(buf)?;
-            Ok(s)
-        }
-
         // Verify magic
-        let is_legacy_model: bool = match read_u32(&mut reader)? {
-            ggml::FILE_MAGIC => false,
-            ggml::FILE_MAGIC_UNVERSIONED => true,
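+        // The leading magic selects the container format; "ggjt" (0x67676a74)
+        // is llama.cpp's newer single-file, mmap-friendly format.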
+        let model_type: ModelType = match read_u32(&mut reader)? {
+            ggml::FILE_MAGIC_GGMF => ModelType::GGMF,
+            ggml::FILE_MAGIC_GGJT => ModelType::GGJT,
+            ggml::FILE_MAGIC_UNVERSIONED => ModelType::Unversioned,
             _ => {
                 return Err(LoadError::InvalidMagic {
                     path: main_path.to_owned(),
@@ -638,12 +606,14 @@ impl Model {
         };
 
         // Load format version
-        if !is_legacy_model {
-            #[allow(unused_variables)]
-            let version: u32 = match read_u32(&mut reader)? {
-                ggml::FORMAT_VERSION => ggml::FORMAT_VERSION,
-                version => return Err(LoadError::InvalidFormatVersion { value: version }),
-            };
+        match model_type {
+            ModelType::GGMF | ModelType::GGJT => {
+                let _version: u32 = match read_u32(&mut reader)? {
+                    ggml::FORMAT_VERSION => ggml::FORMAT_VERSION,
+                    version => return Err(LoadError::InvalidFormatVersion { value: version }),
+                };
+            }
+            ModelType::Unversioned => {}
         }
 
         // =================
@@ -678,8 +648,12 @@ impl Model {
         let mut max_token_length = 0;
 
         for i in 0..hparams.n_vocab {
-            let len = read_i32(&mut reader)?;
-            if let Ok(word) = read_string(&mut reader, len as usize) {
+            let len = match model_type {
+                // NB: `read_i32` in this arm may be a typo inherited from the
+                // original loader; GGJT reads the length as a u32.
+                ModelType::GGMF | ModelType::Unversioned => read_i32(&mut reader)? as usize,
+                ModelType::GGJT => read_u32(&mut reader)? as usize,
+            };
+            if let Ok(word) = read_string(&mut reader, len) {
                 max_token_length = max_token_length.max(word.len());
                 id_to_token.push(word.clone());
                 token_to_id.insert(word, TokenId::try_from(i)?);
@@ -689,13 +663,16 @@ impl Model {
             }
 
             // Token score, currently unused
-            if !is_legacy_model {
-                if let Ok(score) = read_f32(&mut reader) {
-                    id_to_token_score.push(score);
+            match model_type {
+                ModelType::GGMF | ModelType::GGJT => {
+                    if let Ok(score) = read_f32(&mut reader) {
+                        id_to_token_score.push(score);
+                    }
+                }
+                ModelType::Unversioned => {
+                    // Legacy model, set empty score
+                    id_to_token_score.push(0.);
                 }
-            } else {
-                // Legacy model, set empty score
-                id_to_token_score.push(0.);
             }
         }
 
@@ -822,262 +799,13 @@ impl Model {
             }
         };
 
-        // Close the file, but keep its offset. That way we know how to skip the
-        // metadata when loading the parts.
-        let file_offset = reader.stream_position()?;
-        drop(reader);
-
-        let paths = {
-            let main_filename = main_path.file_name().and_then(|p| p.to_str());
-
-            let mut paths: Vec<PathBuf> =
-                std::fs::read_dir(main_path.parent().ok_or_else(|| LoadError::NoParentPath {
-                    path: main_path.to_owned(),
-                })?)?
-                .filter_map(Result::ok)
-                .map(|de| de.path())
-                .filter(|p| {
-                    p.file_name()
-                        .and_then(|p| p.to_str())
-                        .zip(main_filename)
-                        .map(|(part_filename, main_filename)| {
-                            part_filename.starts_with(main_filename)
-                        })
-                        .unwrap_or(false)
-                })
-                .collect();
-            paths.sort();
-            paths
-        };
-
-        let n_parts = paths.len();
-
-        for (i, part_path) in paths.into_iter().enumerate() {
-            let part_id = i;
-
-            load_progress_callback(LoadProgress::PartLoading {
-                file: &part_path,
-                current_part: i,
-                total_parts: n_parts,
-            });
-
-            let mut part_reader = BufReader::new(File::open(&part_path)?);
-
-            // Skip metadata
-            part_reader.seek(SeekFrom::Start(file_offset))?;
-
-            let mut total_size = 0;
-            let mut n_tensors = 0;
-
-            // Load weights
-            loop {
-                // NOTE: Implementation from #![feature(buf_read_has_data_left)]
-                let is_eof = part_reader.fill_buf().map(|b| b.is_empty())?;
-
-                if is_eof {
-                    break;
-                }
-
-                let n_dims = usize::try_from(read_i32(&mut part_reader)?)?;
-                let length = read_i32(&mut part_reader)?;
-                let ftype = read_u32(&mut part_reader)?;
-
-                let mut nelements = 1;
-                let mut ne = [1i64, 1i64];
-
-                #[allow(clippy::needless_range_loop)]
-                for i in 0..n_dims {
-                    ne[i] = read_i32(&mut part_reader)? as i64;
-                    nelements *= usize::try_from(ne[i])?;
-                }
-
-                let tensor_name = read_string(&mut part_reader, length as usize)?;
-
-                let Some(tensor) = model.tensors.get(&tensor_name)
-                else {
-                    return Err(LoadError::UnknownTensor { tensor_name, path: part_path });
-                };
-
-                // split_type = 0: split by columns
-                // split_type = 1: split by rows
-                //
-                // split_type = 0:
-                // regex:
-                //   - tok_embeddings.*
-                //   - layers.*.attention.wo.weight
-                //   - layers.*.feed_forward.w2.weight
-
-                // split_type = 1:
-                // regex:
-                //   - output.*
-                //   - layers.*.attention.wq.weight
-                //   - layers.*.attention.wk.weight
-                //   - layers.*.attention.wv.weight
-                //   - layers.*.feed_forward.w1.weight
-                //   - layers.*.feed_forward.w3.weight
-                #[allow(clippy::if_same_then_else)]
-                let split_type = if tensor_name.contains("tok_embeddings") {
-                    0
-                } else if tensor_name.contains("layers") {
-                    if tensor_name.contains("attention.wo.weight") {
-                        0
-                    } else if tensor_name.contains("feed_forward.w2.weight") {
-                        0
-                    } else {
-                        1
-                    }
-                } else if tensor_name.contains("output") {
-                    1
-                } else {
-                    0
-                };
-
-                if n_dims == 1 {
-                    if tensor.nelements() != nelements {
-                        return Err(LoadError::TensorWrongSize {
-                            tensor_name,
-                            path: part_path,
-                        });
-                    }
-                } else if tensor.nelements() / n_parts != nelements {
-                    return Err(LoadError::TensorWrongSize {
-                        tensor_name,
-                        path: part_path,
-                    });
-                }
-
-                if n_dims == 1 {
-                    if tensor.get_ne()[0] != ne[0] || tensor.get_ne()[1] != ne[1] {
-                        return Err(LoadError::TensorWrongSize {
-                            tensor_name,
-                            path: part_path,
-                        });
-                    }
-                } else if split_type == 0 {
-                    if tensor.get_ne()[0] / i64::try_from(n_parts)? != ne[0]
-                        || tensor.get_ne()[1] != ne[1]
-                    {
-                        return Err(LoadError::TensorWrongSize {
-                            tensor_name,
-                            path: part_path,
-                        });
-                    }
-                } else if tensor.get_ne()[0] != ne[0]
-                    || tensor.get_ne()[1] / i64::try_from(n_parts)? != ne[1]
-                {
-                    return Err(LoadError::TensorWrongSize {
-                        tensor_name,
-                        path: part_path,
-                    });
-                }
-
-                let bpe = match ftype {
-                    0 => ggml::type_size(ggml::TYPE_F32),
-                    1 => ggml::type_size(ggml::TYPE_F16),
-                    2 => {
-                        assert_eq!(ne[0] % 64, 0);
-                        ggml::type_size(ggml::TYPE_Q4_0)
-                    }
-                    3 => {
-                        assert_eq!(ne[0] % 64, 0);
-                        ggml::type_size(ggml::TYPE_Q4_1)
-                    }
-                    _ => {
-                        return Err(LoadError::InvalidFtype {
-                            tensor_name,
-                            ftype,
-                            path: part_path,
-                        })
-                    }
-                };
-
-                if n_dims == 1 || n_parts == 1 {
-                    if (nelements * bpe) / ggml::blck_size(tensor.get_type()) != tensor.nbytes() {
-                        return Err(LoadError::TensorWrongSize {
-                            tensor_name,
-                            path: part_path,
-                        });
-                    }
-
-                    if part_id == 0 {
-                        // SAFETY: yolo, same as original code
-                        let slice = unsafe {
-                            let data = tensor.data();
-                            std::slice::from_raw_parts_mut(data as *mut u8, tensor.nbytes())
-                        };
-                        part_reader.read_exact(slice)?;
-                    } else {
-                        part_reader.seek(SeekFrom::Current(tensor.nbytes() as i64))?;
-                    }
-
-                    total_size += tensor.nbytes();
-                } else {
-                    if (nelements * bpe) / ggml::blck_size(tensor.get_type())
-                        != tensor.nbytes() / n_parts
-                    {
-                        return Err(LoadError::TensorWrongSize {
-                            tensor_name,
-                            path: part_path,
-                        });
-                    }
-
-                    if split_type == 0 {
-                        let np0 = ne[0];
-                        let row_size = (usize::try_from(tensor.get_ne()[0])?
-                            / ggml::blck_size(tensor.get_type()))
-                            * ggml::type_size(tensor.get_type());
-
-                        assert_eq!(row_size, tensor.get_nb()[1]);
-
-                        for i1 in 0..ne[1] {
-                            let offset_row = i1 as usize * row_size;
-                            let offset = offset_row
-                                + ((part_id * np0 as usize) / ggml::blck_size(tensor.get_type()))
-                                    * ggml::type_size(tensor.get_type());
-                            // SAFETY: yolo, same as original code
-                            unsafe {
-                                let ptr = tensor.data().add(offset);
-                                let slice = std::slice::from_raw_parts_mut(
-                                    ptr as *mut u8,
-                                    row_size / n_parts,
-                                );
-                                part_reader.read_exact(slice)?;
-                            }
-                        }
-                    } else {
-                        let np1 = ne[1];
-                        let row_size = (usize::try_from(tensor.get_ne()[0])?
-                            / ggml::blck_size(tensor.get_type()))
-                            * ggml::type_size(tensor.get_type());
-
-                        for i1 in 0..ne[1] {
-                            let offset_row = (i1 as usize + part_id * np1 as usize) * row_size;
-                            // SAFETY: yolo, same as original code
-                            unsafe {
-                                let ptr = tensor.data().add(offset_row);
-                                let slice =
-                                    std::slice::from_raw_parts_mut(ptr as *mut u8, row_size);
-                                part_reader.read_exact(slice)?;
-                            }
-                        }
-                    }
-
-                    total_size += tensor.nbytes() / n_parts;
-                }
-
-                n_tensors += 1;
-                load_progress_callback(LoadProgress::PartTensorLoaded {
-                    file: &part_path,
-                    current_tensor: n_tensors.try_into()?,
-                    tensor_count: model.tensors.len(),
-                });
+        match model_type {
+            ModelType::GGMF | ModelType::Unversioned => {
+                load_weights_ggmf_or_unversioned(reader, main_path, load_progress_callback, &model)?
+            }
+            ModelType::GGJT => {
+                load_weights_ggjt(reader, main_path, load_progress_callback, &model)?
+            }
+        }
-
-            load_progress_callback(LoadProgress::PartLoaded {
-                file: &part_path,
-                byte_size: total_size,
-                tensor_count: n_tensors.try_into()?,
-            });
-        }
 
         Ok((model, vocab))

diff --git a/llama-rs/src/loader.rs b/llama-rs/src/loader.rs
new file mode 100644
index 00000000..f17c0641
--- /dev/null
+++ b/llama-rs/src/loader.rs
@@ -0,0 +1,310 @@
+use std::{
+    fs::File,
+    io::{BufRead, BufReader, Read, Seek, SeekFrom},
+    path::{Path, PathBuf},
+};
+
+use crate::*;
+
+pub(crate) fn read_bytes<const N: usize>(reader: &mut impl BufRead) -> Result<[u8; N], LoadError> {
+    let mut bytes = [0u8; N];
+    reader
+        .read_exact(&mut bytes)
+        .map_err(|e| LoadError::ReadExactFailed {
+            source: e,
+            bytes: N,
+        })?;
+    Ok(bytes)
+}
+
+pub(crate) fn read_i32(reader: &mut impl BufRead) -> Result<i32, LoadError> {
+    Ok(i32::from_le_bytes(read_bytes::<4>(reader)?))
+}
+
+pub(crate) fn read_u32(reader: &mut impl BufRead) -> Result<u32, LoadError> {
+    Ok(u32::from_le_bytes(read_bytes::<4>(reader)?))
+}
+
+pub(crate) fn read_f32(reader: &mut impl BufRead) -> Result<f32, LoadError> {
+    Ok(f32::from_le_bytes(read_bytes::<4>(reader)?))
+}
+
+/// Helper function. Reads a string from the buffer and returns it.
+pub(crate) fn read_string(reader: &mut BufReader<File>, len: usize) -> Result<String, LoadError> {
+    let mut buf = vec![0; len];
+    reader
+        .read_exact(&mut buf)
+        .map_err(|e| LoadError::ReadExactFailed {
+            source: e,
+            bytes: buf.len(),
+        })?;
+    let s = String::from_utf8(buf)?;
+    Ok(s)
+}
+
+#[derive(PartialEq)]
+pub(crate) enum ModelType {
+    GGMF,
+    GGJT,
+    Unversioned,
+}
+
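+/// Loads multi-part GGMF/unversioned weights. `reader` has already consumed
+/// the metadata; its stream position marks where tensor data begins in every
+/// part file.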
+pub(crate) fn load_weights_ggmf_or_unversioned(
+    mut reader: BufReader<File>,
+    main_path: &Path,
+    load_progress_callback: impl Fn(LoadProgress),
+    model: &Model,
+) -> Result<(), LoadError> {
+    // Close the file, but keep its offset. That way we know how to skip the
+    // metadata when loading the parts.
+    let file_offset = reader.stream_position()?;
+    drop(reader);
+
+    let paths = {
+        let main_filename = main_path.file_name().and_then(|p| p.to_str());
+
+        let mut paths: Vec<PathBuf> =
+            std::fs::read_dir(main_path.parent().ok_or_else(|| LoadError::NoParentPath {
+                path: main_path.to_owned(),
+            })?)?
+            .filter_map(Result::ok)
+            .map(|de| de.path())
+            .filter(|p| {
+                p.file_name()
+                    .and_then(|p| p.to_str())
+                    .zip(main_filename)
+                    .map(|(part_filename, main_filename)| part_filename.starts_with(main_filename))
+                    .unwrap_or(false)
+            })
+            .collect();
+        paths.sort();
+        paths
+    };
+
+    let n_parts = paths.len();
+
+    for (i, part_path) in paths.into_iter().enumerate() {
+        let part_id = i;
+
+        load_progress_callback(LoadProgress::PartLoading {
+            file: &part_path,
+            current_part: i,
+            total_parts: n_parts,
+        });
+
+        let mut part_reader = BufReader::new(File::open(&part_path)?);
+
+        // Skip metadata
+        part_reader.seek(SeekFrom::Start(file_offset))?;
+
+        let mut total_size = 0;
+        let mut n_tensors = 0;
+
+        // Load weights
+        loop {
+            // NOTE: Implementation from #![feature(buf_read_has_data_left)]
+            let is_eof = part_reader.fill_buf().map(|b| b.is_empty())?;
+
+            if is_eof {
+                break;
+            }
+
+            let n_dims = usize::try_from(read_i32(&mut part_reader)?)?;
+            let length = read_i32(&mut part_reader)?;
+            let ftype = read_u32(&mut part_reader)?;
+
+            let mut nelements = 1;
+            let mut ne = [1i64, 1i64];
+
+            #[allow(clippy::needless_range_loop)]
+            for i in 0..n_dims {
+                ne[i] = read_i32(&mut part_reader)? as i64;
+                nelements *= usize::try_from(ne[i])?;
+            }
+
+            let tensor_name = read_string(&mut part_reader, length as usize)?;
+
+            let Some(tensor) = model.tensors.get(&tensor_name)
+            else {
+                return Err(LoadError::UnknownTensor { tensor_name, path: part_path });
+            };
+
+            // split_type = 0: split by columns
+            // split_type = 1: split by rows
+            //
+            // split_type = 0:
+            // regex:
+            //   - tok_embeddings.*
+            //   - layers.*.attention.wo.weight
+            //   - layers.*.feed_forward.w2.weight
+
+            // split_type = 1:
+            // regex:
+            //   - output.*
+            //   - layers.*.attention.wq.weight
+            //   - layers.*.attention.wk.weight
+            //   - layers.*.attention.wv.weight
+            //   - layers.*.feed_forward.w1.weight
+            //   - layers.*.feed_forward.w3.weight
+            #[allow(clippy::if_same_then_else)]
+            let split_type = if tensor_name.contains("tok_embeddings") {
+                0
+            } else if tensor_name.contains("layers") {
+                if tensor_name.contains("attention.wo.weight") {
+                    0
+                } else if tensor_name.contains("feed_forward.w2.weight") {
+                    0
+                } else {
+                    1
+                }
+            } else if tensor_name.contains("output") {
+                1
+            } else {
+                0
+            };
+
+            if n_dims == 1 {
+                if tensor.nelements() != nelements {
+                    return Err(LoadError::TensorWrongSize {
+                        tensor_name,
+                        path: part_path,
+                    });
+                }
+            } else if tensor.nelements() / n_parts != nelements {
+                return Err(LoadError::TensorWrongSize {
+                    tensor_name,
+                    path: part_path,
+                });
+            }
+
+            if n_dims == 1 {
+                if tensor.get_ne()[0] != ne[0] || tensor.get_ne()[1] != ne[1] {
+                    return Err(LoadError::TensorWrongSize {
+                        tensor_name,
+                        path: part_path,
+                    });
+                }
+            } else if split_type == 0 {
+                if tensor.get_ne()[0] / i64::try_from(n_parts)? != ne[0]
+                    || tensor.get_ne()[1] != ne[1]
+                {
+                    return Err(LoadError::TensorWrongSize {
+                        tensor_name,
+                        path: part_path,
+                    });
+                }
+            } else if tensor.get_ne()[0] != ne[0]
+                || tensor.get_ne()[1] / i64::try_from(n_parts)? != ne[1]
+            {
+                return Err(LoadError::TensorWrongSize {
+                    tensor_name,
+                    path: part_path,
+                });
+            }
+
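+            // Bytes per element (or per quantized block) for the declared
+            // tensor type, used to validate the on-disk payload size below.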
+            let bpe = match ftype {
+                0 => ggml::type_size(ggml::TYPE_F32),
+                1 => ggml::type_size(ggml::TYPE_F16),
+                2 => {
+                    assert_eq!(ne[0] % 64, 0);
+                    ggml::type_size(ggml::TYPE_Q4_0)
+                }
+                3 => {
+                    assert_eq!(ne[0] % 64, 0);
+                    ggml::type_size(ggml::TYPE_Q4_1)
+                }
+                _ => {
+                    return Err(LoadError::InvalidFtype {
+                        tensor_name,
+                        ftype,
+                        path: part_path,
+                    })
+                }
+            };
+
+            if n_dims == 1 || n_parts == 1 {
+                if (nelements * bpe) / ggml::blck_size(tensor.get_type()) != tensor.nbytes() {
+                    return Err(LoadError::TensorWrongSize {
+                        tensor_name,
+                        path: part_path,
+                    });
+                }
+
+                if part_id == 0 {
+                    // SAFETY: yolo, same as original code
+                    let slice = unsafe {
+                        let data = tensor.data();
+                        std::slice::from_raw_parts_mut(data as *mut u8, tensor.nbytes())
+                    };
+                    part_reader.read_exact(slice)?;
+                } else {
+                    part_reader.seek(SeekFrom::Current(tensor.nbytes() as i64))?;
+                }
+
+                total_size += tensor.nbytes();
+            } else {
+                if (nelements * bpe) / ggml::blck_size(tensor.get_type())
+                    != tensor.nbytes() / n_parts
+                {
+                    return Err(LoadError::TensorWrongSize {
+                        tensor_name,
+                        path: part_path,
+                    });
+                }
+
+                if split_type == 0 {
+                    let np0 = ne[0];
+                    let row_size = (usize::try_from(tensor.get_ne()[0])?
+                        / ggml::blck_size(tensor.get_type()))
+                        * ggml::type_size(tensor.get_type());
+
+                    assert_eq!(row_size, tensor.get_nb()[1]);
+
+                    for i1 in 0..ne[1] {
+                        let offset_row = i1 as usize * row_size;
+                        let offset = offset_row
+                            + ((part_id * np0 as usize) / ggml::blck_size(tensor.get_type()))
+                                * ggml::type_size(tensor.get_type());
+                        // SAFETY: yolo, same as original code
+                        unsafe {
+                            let ptr = tensor.data().add(offset);
+                            let slice =
+                                std::slice::from_raw_parts_mut(ptr as *mut u8, row_size / n_parts);
+                            part_reader.read_exact(slice)?;
+                        }
+                    }
+                } else {
+                    let np1 = ne[1];
+                    let row_size = (usize::try_from(tensor.get_ne()[0])?
+                        / ggml::blck_size(tensor.get_type()))
+                        * ggml::type_size(tensor.get_type());
+
+                    for i1 in 0..ne[1] {
+                        let offset_row = (i1 as usize + part_id * np1 as usize) * row_size;
+                        // SAFETY: yolo, same as original code
+                        unsafe {
+                            let ptr = tensor.data().add(offset_row);
+                            let slice = std::slice::from_raw_parts_mut(ptr as *mut u8, row_size);
+                            part_reader.read_exact(slice)?;
+                        }
+                    }
+                }
+
+                total_size += tensor.nbytes() / n_parts;
+            }
+
+            n_tensors += 1;
+            load_progress_callback(LoadProgress::PartTensorLoaded {
+                file: &part_path,
+                current_tensor: n_tensors.try_into()?,
+                tensor_count: model.tensors.len(),
+            });
+        }
+
+        load_progress_callback(LoadProgress::PartLoaded {
+            file: &part_path,
+            byte_size: total_size,
+            tensor_count: n_tensors.try_into()?,
+        });
+    }
+
+    Ok(())
+}
+
+pub(crate) fn load_weights_ggjt(
+    mut reader: BufReader<File>,
+    main_path: &Path,
+    load_progress_callback: impl Fn(LoadProgress),
+    model: &Model,
+) -> Result<(), LoadError> {
+    // GGJT keeps tensor data in the same file as the metadata, so this path
+    // should read from `reader` directly instead of scanning for part files.
+    todo!("GGJT load weights");
+}
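
For future reference, a rough sketch of what the GGJT branch could look like,
assuming the single-file layout written by llama.cpp's ggjt exporter (tensor
data follows the metadata in the same file, with each tensor's data padded to
a 32-byte boundary so the file can be mmap'd). The helpers and error variants
mirror the GGMF path above; the PartTensorLoaded/PartLoaded progress variants
are reused here because there are no GGJT-specific ones, and the size checks
of the GGMF path are elided. Verify the details against upstream llama.cpp
before implementing:

    pub(crate) fn load_weights_ggjt(
        mut reader: BufReader<File>,
        main_path: &Path,
        load_progress_callback: impl Fn(LoadProgress),
        model: &Model,
    ) -> Result<(), LoadError> {
        let mut total_size = 0;
        let mut n_tensors: usize = 0;

        loop {
            let is_eof = reader.fill_buf().map(|b| b.is_empty())?;
            if is_eof {
                break;
            }

            // Same per-tensor header as GGMF: dims, name length, ftype, shape, name.
            let n_dims = usize::try_from(read_i32(&mut reader)?)?;
            let length = read_i32(&mut reader)?;
            let _ftype = read_u32(&mut reader)?;
            let mut ne = [1i64, 1i64];
            #[allow(clippy::needless_range_loop)]
            for i in 0..n_dims {
                ne[i] = read_i32(&mut reader)? as i64;
            }
            let tensor_name = read_string(&mut reader, length as usize)?;
            let Some(tensor) = model.tensors.get(&tensor_name) else {
                return Err(LoadError::UnknownTensor { tensor_name, path: main_path.to_owned() });
            };

            // ggjt pads to a 32-byte boundary before each tensor's data.
            let offset = reader.stream_position()?;
            reader.seek(SeekFrom::Start((offset + 31) & !31))?;

            // SAFETY: yolo, same as the GGMF path.
            let slice = unsafe {
                std::slice::from_raw_parts_mut(tensor.data() as *mut u8, tensor.nbytes())
            };
            reader.read_exact(slice)?;

            total_size += tensor.nbytes();
            n_tensors += 1;
            load_progress_callback(LoadProgress::PartTensorLoaded {
                file: main_path,
                current_tensor: n_tensors,
                tensor_count: model.tensors.len(),
            });
        }

        load_progress_callback(LoadProgress::PartLoaded {
            file: main_path,
            byte_size: total_size,
            tensor_count: n_tensors,
        });
        Ok(())
    }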