wip

helix-editor · Dec 3, 2022 · 8dc48de · 8dc48de
1 parent bcdb475
commit 8dc48de
Show file tree

Hide file tree

Showing 5 changed files with 239 additions and 184 deletions.
diff --git a/helix-term/src/commands.rs b/helix-term/src/commands.rs
@@ -4785,7 +4785,7 @@ async fn shell_impl_async(
     let output = if let Some(mut stdin) = process.stdin.take() {
         let input_task = tokio::spawn(async move {
             if let Some(input) = input {
-                helix_view::document::to_writer(&mut stdin, encoding::UTF_8, &input).await?;
+                helix_view::stream::to_writer(&mut stdin, encoding::UTF_8, &input).await?;
             }
             Ok::<_, anyhow::Error>(())
         });

diff --git a/helix-view/src/document.rs b/helix-view/src/document.rs
@@ -27,11 +27,9 @@ use helix_core::{
 };
 
 use crate::editor::RedrawHandle;
+use crate::stream::{from_reader, to_writer};
 use crate::{apply_transaction, DocumentId, Editor, View, ViewId};
 
-/// 8kB of buffer space for encoding and decoding `Rope`s.
-const BUF_SIZE: usize = 8192;
-
 const DEFAULT_INDENT: IndentStyle = IndentStyle::Tabs;
 
 pub const SCRATCH_BUFFER_NAME: &str = "[scratch]";
@@ -164,181 +162,6 @@ impl fmt::Debug for Document {
     }
 }
 
-// The documentation and implementation of this function should be up-to-date with
-// its sibling function, `to_writer()`.
-//
-/// Decodes a stream of bytes into UTF-8, returning a `Rope` and the
-/// encoding it was decoded as. The optional `encoding` parameter can
-/// be used to override encoding auto-detection.
-pub fn from_reader<R: std::io::Read + ?Sized>(
-    reader: &mut R,
-    encoding: Option<&'static encoding::Encoding>,
-) -> Result<(Rope, &'static encoding::Encoding), Error> {
-    // These two buffers are 8192 bytes in size each and are used as
-    // intermediaries during the decoding process. Text read into `buf`
-    // from `reader` is decoded into `buf_out` as UTF-8. Once either
-    // `buf_out` is full or the end of the reader was reached, the
-    // contents are appended to `builder`.
-    let mut buf = [0u8; BUF_SIZE];
-    let mut buf_out = [0u8; BUF_SIZE];
-    let mut builder = RopeBuilder::new();
-
-    // By default, the encoding of the text is auto-detected via the
-    // `chardetng` crate which requires sample data from the reader.
-    // As a manual override to this auto-detection is possible, the
-    // same data is read into `buf` to ensure symmetry in the upcoming
-    // loop.
-    let (encoding, mut decoder, mut slice, mut is_empty) = {
-        let read = reader.read(&mut buf)?;
-        let is_empty = read == 0;
-        let encoding = encoding.unwrap_or_else(|| {
-            let mut encoding_detector = chardetng::EncodingDetector::new();
-            encoding_detector.feed(&buf, is_empty);
-            encoding_detector.guess(None, true)
-        });
-        let decoder = encoding.new_decoder();
-
-        // If the amount of bytes read from the reader is less than
-        // `buf.len()`, it is undesirable to read the bytes afterwards.
-        let slice = &buf[..read];
-        (encoding, decoder, slice, is_empty)
-    };
-
-    // `RopeBuilder::append()` expects a `&str`, so this is the "real"
-    // output buffer. When decoding, the number of bytes in the output
-    // buffer will often exceed the number of bytes in the input buffer.
-    // The `result` returned by `decode_to_str()` will state whether or
-    // not that happened. The contents of `buf_str` is appended to
-    // `builder` and it is reused for the next iteration of the decoding
-    // loop.
-    //
-    // As it is possible to read less than the buffer's maximum from `read()`
-    // even when the end of the reader has yet to be reached, the end of
-    // the reader is determined only when a `read()` call returns `0`.
-    //
-    // SAFETY: `buf_out` is a zero-initialized array, thus it will always
-    // contain valid UTF-8.
-    let buf_str = unsafe { std::str::from_utf8_unchecked_mut(&mut buf_out[..]) };
-    let mut total_written = 0usize;
-    loop {
-        let mut total_read = 0usize;
-
-        // An inner loop is necessary as it is possible that the input buffer
-        // may not be completely decoded on the first `decode_to_str()` call
-        // which would happen in cases where the output buffer is filled to
-        // capacity.
-        loop {
-            let (result, read, written, ..) = decoder.decode_to_str(
-                &slice[total_read..],
-                &mut buf_str[total_written..],
-                is_empty,
-            );
-
-            // These variables act as the read and write cursors of `buf` and `buf_str` respectively.
-            // They are necessary in case the output buffer fills before decoding of the entire input
-            // buffer is complete. Otherwise, the loop would endlessly iterate over the same `buf` and
-            // the data inside the output buffer would be overwritten.
-            total_read += read;
-            total_written += written;
-            match result {
-                encoding::CoderResult::InputEmpty => {
-                    debug_assert_eq!(slice.len(), total_read);
-                    break;
-                }
-                encoding::CoderResult::OutputFull => {
-                    debug_assert!(slice.len() > total_read);
-                    builder.append(&buf_str[..total_written]);
-                    total_written = 0;
-                }
-            }
-        }
-        // Once the end of the stream is reached, the output buffer is
-        // flushed and the loop terminates.
-        if is_empty {
-            debug_assert_eq!(reader.read(&mut buf)?, 0);
-            builder.append(&buf_str[..total_written]);
-            break;
-        }
-
-        // Once the previous input has been processed and decoded, the next set of
-        // data is fetched from the reader. The end of the reader is determined to
-        // be when exactly `0` bytes were read from the reader, as per the invariants
-        // of the `Read` trait.
-        let read = reader.read(&mut buf)?;
-        slice = &buf[..read];
-        is_empty = read == 0;
-    }
-    let rope = builder.finish();
-    Ok((rope, encoding))
-}
-
-// The documentation and implementation of this function should be up-to-date with
-// its sibling function, `from_reader()`.
-//
-/// Encodes the text inside `rope` into the given `encoding` and writes the
-/// encoded output into `writer`. As a `Rope` can only contain valid UTF-8,
-/// replacement characters may appear in the encoded text.
-pub async fn to_writer<'a, W: tokio::io::AsyncWriteExt + Unpin + ?Sized>(
-    writer: &'a mut W,
-    encoding: &'static encoding::Encoding,
-    rope: &'a Rope,
-) -> Result<(), Error> {
-    // Text inside a `Rope` is stored as non-contiguous blocks of data called
-    // chunks. The absolute size of each chunk is unknown, thus it is impossible
-    // to predict the end of the chunk iterator ahead of time. Instead, it is
-    // determined by filtering the iterator to remove all empty chunks and then
-    // appending an empty chunk to it. This is valuable for detecting when all
-    // chunks in the `Rope` have been iterated over in the subsequent loop.
-    let iter = rope
-        .chunks()
-        .filter(|c| !c.is_empty())
-        .chain(std::iter::once(""));
-    let mut buf = [0u8; BUF_SIZE];
-    let mut encoder = encoding.new_encoder();
-    let mut total_written = 0usize;
-    for chunk in iter {
-        let is_empty = chunk.is_empty();
-        let mut total_read = 0usize;
-
-        // An inner loop is necessary as it is possible that the input buffer
-        // may not be completely encoded on the first `encode_from_utf8()` call
-        // which would happen in cases where the output buffer is filled to
-        // capacity.
-        loop {
-            let (result, read, written, ..) =
-                encoder.encode_from_utf8(&chunk[total_read..], &mut buf[total_written..], is_empty);
-
-            // These variables act as the read and write cursors of `chunk` and `buf` respectively.
-            // They are necessary in case the output buffer fills before encoding of the entire input
-            // chunk is complete. Otherwise, the loop would endlessly iterate over the same `chunk` and
-            // the data inside the output buffer would be overwritten.
-            total_read += read;
-            total_written += written;
-            match result {
-                encoding::CoderResult::InputEmpty => {
-                    debug_assert_eq!(chunk.len(), total_read);
-                    debug_assert!(buf.len() >= total_written);
-                    break;
-                }
-                encoding::CoderResult::OutputFull => {
-                    debug_assert!(chunk.len() > total_read);
-                    writer.write_all(&buf[..total_written]).await?;
-                    total_written = 0;
-                }
-            }
-        }
-
-        // Once the end of the iterator is reached, the output buffer is
-        // flushed and the outer loop terminates.
-        if is_empty {
-            writer.write_all(&buf[..total_written]).await?;
-            writer.flush().await?;
-            break;
-        }
-    }
-    Ok(())
-}
-
 fn take_with<T, F>(mut_ref: &mut T, f: F)
 where
     T: Default,
@@ -392,7 +215,7 @@ impl Document {
         let (rope, encoding) = if path.exists() {
             let mut file =
                 std::fs::File::open(path).context(format!("unable to open {:?}", path))?;
-            from_reader(&mut file, encoding)?
+            from_reader::<_, RopeBuilder>(&mut file, encoding)?
         } else {
             let encoding = encoding.unwrap_or(encoding::UTF_8);
             (Rope::from(DEFAULT_LINE_ENDING.as_str()), encoding)
@@ -644,7 +467,7 @@ impl Document {
             .to_owned();
 
         let mut file = std::fs::File::open(&path)?;
-        let (rope, ..) = from_reader(&mut file, Some(encoding))?;
+        let (rope, ..) = from_reader::<_, RopeBuilder>(&mut file, Some(encoding))?;
 
         // Calculate the difference between the buffer and source text, and apply it.
         // This is not considered a modification of the contents of the file regardless
@@ -1071,7 +894,9 @@ impl Document {
 
     /// Initializes/updates the differ for this document with a new base.
     pub fn set_diff_base(&mut self, diff_base: Vec<u8>, redraw_handle: RedrawHandle) {
-        if let Ok((diff_base, _)) = from_reader(&mut diff_base.as_slice(), Some(self.encoding)) {
+        if let Ok((diff_base, _)) =
+            from_reader::<_, RopeBuilder>(&mut diff_base.as_slice(), Some(self.encoding))
+        {
             if let Some(differ) = &self.diff_handle {
                 differ.update_diff_base(diff_base);
                 return;
@@ -1409,7 +1234,7 @@ mod test {
                 assert!(ref_path.exists());
 
                 let mut file = std::fs::File::open(path).unwrap();
-                let text = from_reader(&mut file, Some(encoding))
+                let text = from_reader::<_, RopeBuilder>(&mut file, Some(encoding))
                     .unwrap()
                     .0
                     .to_string();

diff --git a/helix-view/src/editor.rs b/helix-view/src/editor.rs
@@ -1114,7 +1114,8 @@ impl Editor {
     }
 
     pub fn new_file_from_stdin(&mut self, action: Action) -> Result<DocumentId, Error> {
-        let (rope, encoding) = crate::document::from_reader(&mut stdin(), None)?;
+        let (rope, encoding) =
+            crate::stream::from_reader::<_, helix_core::RopeBuilder>(&mut stdin(), None)?;
         Ok(self.new_file_from_document(action, Document::from(rope, Some(encoding))))
     }
 

diff --git a/helix-view/src/lib.rs b/helix-view/src/lib.rs
@@ -7,6 +7,7 @@ pub mod editor;
 pub mod env;
 pub mod graphics;
 pub mod gutter;
+pub mod stream;
 pub mod handlers {
     pub mod dap;
     pub mod lsp;