diff --git a/Cargo.lock b/Cargo.lock index 31ba9651..4c62603c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -378,6 +378,7 @@ dependencies = [ "bincode", "bytemuck", "ggml", + "memmap2", "partial_sort", "rand", "serde", @@ -406,6 +407,15 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +[[package]] +name = "memmap2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" +dependencies = [ + "libc", +] + [[package]] name = "minimal-lexical" version = "0.2.1" diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs index 700a02e1..ba2b3f59 100644 --- a/ggml/src/lib.rs +++ b/ggml/src/lib.rs @@ -322,13 +322,21 @@ impl Tensor { /// # Safety /// /// The data must not be mutated while being read from. - pub unsafe fn data(&self) -> *mut c_void { + pub unsafe fn data(&self) -> *const c_void { self.with_alive_ctx(|| { // SAFETY: The with_alive_call guarantees the context is alive unsafe { *self.ptr.as_ptr() }.data }) } + /// Set the tensor's data pointer (useful for mmap-ed data) + pub unsafe fn set_data(&self, data_ptr: *mut c_void) { + self.with_alive_ctx(|| { + // SAFETY: The with_alive_call guarantees the context is alive + unsafe { *self.ptr.as_ptr() }.data = data_ptr; + }) + } + /// Number of elements in this tensor. pub fn nelements(&self) -> usize { self.with_alive_ctx(|| { diff --git a/llama-rs/Cargo.toml b/llama-rs/Cargo.toml index 76629b55..8802c585 100644 --- a/llama-rs/Cargo.toml +++ b/llama-rs/Cargo.toml @@ -16,3 +16,4 @@ rand = { workspace = true } serde = { version = "1.0.156", features = ["derive"] } serde_bytes = "0.11" bincode = "1.3.3" +memmap2 = "0.5.10" diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs index 92f0d3a6..b6b95b3a 100644 --- a/llama-rs/src/lib.rs +++ b/llama-rs/src/lib.rs @@ -14,6 +14,7 @@ use std::{ time, }; +use memmap2::Mmap; use thiserror::Error; use partial_sort::PartialSort; @@ -66,6 +67,8 @@ pub struct Model { tensors: HashMap, + mmap: Option, + // Must be kept alive for the model _context: ggml::Context, } @@ -502,7 +505,7 @@ pub enum LoadError { /// The name of the tensor. tensor_name: String, /// The format type that was encountered. - ftype: u32, + ftype: i32, /// The path that failed. path: PathBuf, }, @@ -585,12 +588,13 @@ impl Model { let main_path = path.as_ref(); - let mut reader = - BufReader::new( - File::open(main_path).map_err(|e| LoadError::OpenFileFailed { + let file = File::open(main_path).map_err(|e| LoadError::OpenFileFailed { source: e, path: main_path.to_owned(), - })?, + })?; + let mut reader = + BufReader::new( + &file, ); // Verify magic @@ -732,7 +736,7 @@ impl Model { // Initialize the context let context = ggml::Context::init(ctx_size); - let model = { + let mut model = { let mut tensors = HashMap::new(); let tok_embeddings = context.new_tensor_2d(wtype, n_embd, n_vocab); @@ -796,15 +800,20 @@ impl Model { layers, tensors, _context: context, + mmap: None, } }; match model_type { ModelType::GGMF | ModelType::Unversioned => { - load_weights_ggmf_or_unversioned(reader, main_path, load_progress_callback, &model)? + let file_offset = reader.stream_position()?; + drop(reader); + load_weights_ggmf_or_unversioned(file_offset, main_path, load_progress_callback, &model)? } ModelType::GGJT => { - load_weights_ggjt(reader, main_path, load_progress_callback, &model)? + let mmap = unsafe { Mmap::map(&file)? }; + load_weights_ggjt(&mut reader, &mmap, main_path, load_progress_callback, &model)?; + model.mmap = Some(mmap); } } diff --git a/llama-rs/src/loader.rs b/llama-rs/src/loader.rs index f17c0641..793ba4fe 100644 --- a/llama-rs/src/loader.rs +++ b/llama-rs/src/loader.rs @@ -26,7 +26,7 @@ pub(crate) fn read_f32(reader: &mut impl BufRead) -> Result { } /// Helper function. Reads a string from the buffer and returns it. -pub(crate) fn read_string(reader: &mut BufReader, len: usize) -> Result { +pub(crate) fn read_string(reader: &mut impl BufRead, len: usize) -> Result { let mut buf = vec![0; len]; reader .read_exact(&mut buf) @@ -38,6 +38,11 @@ pub(crate) fn read_string(reader: &mut BufReader, len: usize) -> Result Result { + reader.fill_buf().map(|b| !b.is_empty()) +} + #[derive(PartialEq)] pub(crate) enum ModelType { GGMF, @@ -46,13 +51,11 @@ pub(crate) enum ModelType { } pub(crate) fn load_weights_ggmf_or_unversioned( - mut reader: std::io::BufReader, + file_offset: u64, main_path: &Path, load_progress_callback: impl Fn(LoadProgress), model: &Model, ) -> Result<(), LoadError> { - let file_offset = reader.stream_position()?; - drop(reader); let paths = { let main_filename = main_path.file_name().and_then(|p| p.to_str()); @@ -93,125 +96,23 @@ pub(crate) fn load_weights_ggmf_or_unversioned( // Load weights loop { - // NOTE: Implementation from #![feature(buf_read_has_data_left)] - let is_eof = part_reader.fill_buf().map(|b| b.is_empty())?; - - if is_eof { + if !has_data_left(&mut part_reader)? { break; } let n_dims = usize::try_from(read_i32(&mut part_reader)?)?; let length = read_i32(&mut part_reader)?; - let ftype = read_u32(&mut part_reader)?; - - let mut nelements = 1; - let mut ne = [1i64, 1i64]; - - #[allow(clippy::needless_range_loop)] - for i in 0..n_dims { - ne[i] = read_i32(&mut part_reader)? as i64; - nelements *= usize::try_from(ne[i])?; - } - - let tensor_name = read_string(&mut part_reader, length as usize)?; - - let Some(tensor) = model.tensors.get(&tensor_name) - else { - return Err(LoadError::UnknownTensor { tensor_name, path: part_path }); - }; - - // split_type = 0: split by columns - // split_type = 1: split by rows - // - // split_type = 0: - // regex: - // - tok_embeddings.* - // - layers.*.attention.wo.weight - // - layers.*.feed_forward.w2.weight - - // split_type = 1: - // regex: - // - output.* - // - layers.*.attention.wq.weight - // - layers.*.attention.wk.weight - // - layers.*.attention.wv.weight - // - layers.*.feed_forward.w1.weight - // - layers.*.feed_forward.w3.weight - #[allow(clippy::if_same_then_else)] - let split_type = if tensor_name.contains("tok_embeddings") { - 0 - } else if tensor_name.contains("layers") { - if tensor_name.contains("attention.wo.weight") { - 0 - } else if tensor_name.contains("feed_forward.w2.weight") { - 0 - } else { - 1 - } - } else if tensor_name.contains("output") { - 1 - } else { - 0 - }; - - if n_dims == 1 { - if tensor.nelements() != nelements { - return Err(LoadError::TensorWrongSize { - tensor_name, - path: part_path, - }); - } - } else if tensor.nelements() / n_parts != nelements { - return Err(LoadError::TensorWrongSize { - tensor_name, - path: part_path, - }); - } - - if n_dims == 1 { - if tensor.get_ne()[0] != ne[0] || tensor.get_ne()[1] != ne[1] { - return Err(LoadError::TensorWrongSize { - tensor_name, - path: part_path, - }); - } - } else if split_type == 0 { - if tensor.get_ne()[0] / i64::try_from(n_parts)? != ne[0] - || tensor.get_ne()[1] != ne[1] - { - return Err(LoadError::TensorWrongSize { - tensor_name, - path: part_path, - }); - } - } else if tensor.get_ne()[0] != ne[0] - || tensor.get_ne()[1] / i64::try_from(n_parts)? != ne[1] - { - return Err(LoadError::TensorWrongSize { - tensor_name, - path: part_path, - }); - } - - let bpe = match ftype { - 0 => ggml::type_size(ggml::TYPE_F32), - 1 => ggml::type_size(ggml::TYPE_F16), - 2 => { - assert_eq!(ne[0] % 64, 0); - ggml::type_size(ggml::TYPE_Q4_0) - } - 3 => { - assert_eq!(ne[0] % 64, 0); - ggml::type_size(ggml::TYPE_Q4_1) - } - _ => { - return Err(LoadError::InvalidFtype { - tensor_name, - ftype, - path: part_path, - }) - } - }; + let ftype = read_i32(&mut part_reader)?; + + let (nelements, ne, tensor_name, tensor, split_type, bpe) = load_tensor_header_ggmf( + n_dims, + &mut part_reader, + length, + model, + &part_path, + n_parts, + ftype, + )?; if n_dims == 1 || n_parts == 1 { if (nelements * bpe) / ggml::blck_size(tensor.get_type()) != tensor.nbytes() { @@ -300,11 +201,188 @@ pub(crate) fn load_weights_ggmf_or_unversioned( }) } +fn load_tensor_header_ggmf<'a>( + n_dims: usize, + reader: &mut BufReader, + length: i32, + model: &'a Model, + path: &Path, + n_parts: usize, + ftype: i32, +) -> Result<(usize, [i64; 2], String, &'a ggml::Tensor, i32, usize), LoadError> { + let mut nelements = 1; + let mut ne = [1i64, 1i64]; + assert!(n_dims <= ne.len()); + #[allow(clippy::needless_range_loop)] + for i in 0..n_dims { + ne[i] = read_i32(reader)? as i64; + nelements *= usize::try_from(ne[i])?; + } + let tensor_name = read_string(reader, length as usize)?; + let Some(tensor) = model.tensors.get(&tensor_name) + else { + return Err(LoadError::UnknownTensor { tensor_name, path: path.to_owned() }); + }; + #[allow(clippy::if_same_then_else)] + let split_type = if tensor_name.contains("tok_embeddings") { + 0 + } else if tensor_name.contains("layers") { + if tensor_name.contains("attention.wo.weight") { + 0 + } else if tensor_name.contains("feed_forward.w2.weight") { + 0 + } else { + 1 + } + } else if tensor_name.contains("output") { + 1 + } else { + 0 + }; + if n_dims == 1 { + if tensor.nelements() != nelements { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: path.to_owned(), + }); + } + } else if tensor.nelements() / n_parts != nelements { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: path.to_owned(), + }); + } + if n_dims == 1 { + if tensor.get_ne()[0] != ne[0] || tensor.get_ne()[1] != ne[1] { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: path.to_owned(), + }); + } + } else if split_type == 0 { + if tensor.get_ne()[0] / i64::try_from(n_parts)? != ne[0] || tensor.get_ne()[1] != ne[1] { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: path.to_owned(), + }); + } + } else if tensor.get_ne()[0] != ne[0] || tensor.get_ne()[1] / i64::try_from(n_parts)? != ne[1] { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: path.to_owned(), + }); + } + let bpe = tensor_type_size(ftype, ne); + let bpe = match bpe { + Some(x) => x, + None => { + return Err(LoadError::InvalidFtype { + tensor_name, + ftype, + path: path.to_owned(), + }); + } + }; + Ok((nelements, ne, tensor_name, tensor, split_type, bpe)) +} + +fn tensor_type_size(ftype: i32, ne: [i64; 2]) -> Option { + let bpe = match ftype { + 0 => Some(ggml::type_size(ggml::TYPE_F32)), + 1 => Some(ggml::type_size(ggml::TYPE_F16)), + 2 => { + assert_eq!(ne[0] % 64, 0); + Some(ggml::type_size(ggml::TYPE_Q4_0)) + } + 3 => { + assert_eq!(ne[0] % 64, 0); + Some(ggml::type_size(ggml::TYPE_Q4_1)) + } + _ => None, + }; + bpe +} + pub(crate) fn load_weights_ggjt( - mut reader: std::io::BufReader, - main_path: &Path, + reader: &mut std::io::BufReader<&File>, + mmap: &Mmap, + path: &Path, load_progress_callback: impl Fn(LoadProgress), model: &Model, -) -> Result<(), LoadError> { - todo!("GGJT load weights"); +) -> Result<(), LoadError> +// where R: std::io::Read +{ + let mut loop_i = 0; + let mut total_loaded_bytes = 0; + load_progress_callback(LoadProgress::PartLoading { + file: path, + current_part: 0, + total_parts: 1, + }); + + loop { + if !has_data_left(reader)? { + break; + } + + let n_dims = read_i32(reader)? as usize; + let length = read_i32(reader)?; + let ftype = read_i32(reader)?; + + let mut nelements: usize = 1; + let mut ne = [1i64, 1]; + assert!(n_dims <= ne.len()); + for i in 0..n_dims { + let dim = read_i32(reader)? as usize; + ne[i] = dim as i64; + nelements *= dim; + } + let tensor_name = read_string(reader, length as usize)?; + let Some(tensor) = model.tensors.get(&tensor_name) + else { + return Err(LoadError::UnknownTensor { tensor_name, path: path.to_owned() }); + }; + + if tensor.nelements() != nelements { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: path.to_owned(), + }); + } + let tensor_ne = tensor.get_ne(); + if tensor_ne[0] != ne[0] || tensor_ne[1] != ne[1] { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: path.to_owned(), + }); + } + + _ = tensor_type_size(ftype, ne); + + let offset_curr = reader.stream_position()?; + let offset_aligned: u64 = (offset_curr + 31) & (31 ^ u64::MAX); + unsafe { + let ptr = mmap.as_ptr().offset(offset_aligned as isize); + tensor.set_data(ptr as *mut std::ffi::c_void); + } + let tensor_data_size = tensor.nbytes() as u64; + reader.seek(SeekFrom::Start(offset_aligned + tensor_data_size))?; + total_loaded_bytes += tensor_data_size; + + load_progress_callback(LoadProgress::PartTensorLoaded { + file: path, + current_tensor: loop_i, + tensor_count: model.tensors.len(), + }); + + loop_i += 1; + } + + load_progress_callback(LoadProgress::PartLoaded { + file: path, + byte_size: total_loaded_bytes as usize, + tensor_count: loop_i, + }); + + return Ok(()); }