Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make from_reader and to_writer generic #4983

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion helix-term/src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4785,7 +4785,7 @@ async fn shell_impl_async(
let output = if let Some(mut stdin) = process.stdin.take() {
let input_task = tokio::spawn(async move {
if let Some(input) = input {
helix_view::document::to_writer(&mut stdin, encoding::UTF_8, &input).await?;
helix_view::stream::to_writer(&mut stdin, encoding::UTF_8, input.chunks()).await?;
}
Ok::<_, anyhow::Error>(())
});
Expand Down
265 changes: 17 additions & 248 deletions helix-view/src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,13 @@ use helix_core::{
indent::{auto_detect_indent_style, IndentStyle},
line_ending::auto_detect_line_ending,
syntax::{self, LanguageConfiguration},
ChangeSet, Diagnostic, LineEnding, Rope, RopeBuilder, Selection, Syntax, Transaction,
DEFAULT_LINE_ENDING,
ChangeSet, Diagnostic, LineEnding, Rope, Selection, Syntax, Transaction, DEFAULT_LINE_ENDING,
};

use crate::editor::RedrawHandle;
use crate::stream::{from_reader, to_writer, RopeWrite};
use crate::{apply_transaction, DocumentId, Editor, View, ViewId};

/// 8kB of buffer space for encoding and decoding `Rope`s.
const BUF_SIZE: usize = 8192;

const DEFAULT_INDENT: IndentStyle = IndentStyle::Tabs;

pub const SCRATCH_BUFFER_NAME: &str = "[scratch]";
Expand Down Expand Up @@ -164,181 +161,6 @@ impl fmt::Debug for Document {
}
}

// The documentation and implementation of this function should be up-to-date with
// its sibling function, `to_writer()`.
//
/// Decodes a stream of bytes into UTF-8, returning a `Rope` and the
/// encoding it was decoded as. The optional `encoding` parameter can
/// be used to override encoding auto-detection.
pub fn from_reader<R: std::io::Read + ?Sized>(
reader: &mut R,
encoding: Option<&'static encoding::Encoding>,
) -> Result<(Rope, &'static encoding::Encoding), Error> {
// These two buffers are 8192 bytes in size each and are used as
// intermediaries during the decoding process. Text read into `buf`
// from `reader` is decoded into `buf_out` as UTF-8. Once either
// `buf_out` is full or the end of the reader was reached, the
// contents are appended to `builder`.
let mut buf = [0u8; BUF_SIZE];
let mut buf_out = [0u8; BUF_SIZE];
let mut builder = RopeBuilder::new();

// By default, the encoding of the text is auto-detected via the
// `chardetng` crate which requires sample data from the reader.
// As a manual override to this auto-detection is possible, the
// same data is read into `buf` to ensure symmetry in the upcoming
// loop.
let (encoding, mut decoder, mut slice, mut is_empty) = {
let read = reader.read(&mut buf)?;
let is_empty = read == 0;
let encoding = encoding.unwrap_or_else(|| {
let mut encoding_detector = chardetng::EncodingDetector::new();
encoding_detector.feed(&buf, is_empty);
encoding_detector.guess(None, true)
});
let decoder = encoding.new_decoder();

// If the amount of bytes read from the reader is less than
// `buf.len()`, it is undesirable to read the bytes afterwards.
let slice = &buf[..read];
(encoding, decoder, slice, is_empty)
};

// `RopeBuilder::append()` expects a `&str`, so this is the "real"
// output buffer. When decoding, the number of bytes in the output
// buffer will often exceed the number of bytes in the input buffer.
// The `result` returned by `decode_to_str()` will state whether or
// not that happened. The contents of `buf_str` is appended to
// `builder` and it is reused for the next iteration of the decoding
// loop.
//
// As it is possible to read less than the buffer's maximum from `read()`
// even when the end of the reader has yet to be reached, the end of
// the reader is determined only when a `read()` call returns `0`.
//
// SAFETY: `buf_out` is a zero-initialized array, thus it will always
// contain valid UTF-8.
let buf_str = unsafe { std::str::from_utf8_unchecked_mut(&mut buf_out[..]) };
let mut total_written = 0usize;
loop {
let mut total_read = 0usize;

// An inner loop is necessary as it is possible that the input buffer
// may not be completely decoded on the first `decode_to_str()` call
// which would happen in cases where the output buffer is filled to
// capacity.
loop {
let (result, read, written, ..) = decoder.decode_to_str(
&slice[total_read..],
&mut buf_str[total_written..],
is_empty,
);

// These variables act as the read and write cursors of `buf` and `buf_str` respectively.
// They are necessary in case the output buffer fills before decoding of the entire input
// loop is complete. Otherwise, the loop would endlessly iterate over the same `buf` and
// the data inside the output buffer would be overwritten.
total_read += read;
total_written += written;
match result {
encoding::CoderResult::InputEmpty => {
debug_assert_eq!(slice.len(), total_read);
break;
}
encoding::CoderResult::OutputFull => {
debug_assert!(slice.len() > total_read);
builder.append(&buf_str[..total_written]);
total_written = 0;
}
}
}
// Once the end of the stream is reached, the output buffer is
// flushed and the loop terminates.
if is_empty {
debug_assert_eq!(reader.read(&mut buf)?, 0);
builder.append(&buf_str[..total_written]);
break;
}

// Once the previous input has been processed and decoded, the next set of
// data is fetched from the reader. The end of the reader is determined to
// be when exactly `0` bytes were read from the reader, as per the invariants
// of the `Read` trait.
let read = reader.read(&mut buf)?;
slice = &buf[..read];
is_empty = read == 0;
}
let rope = builder.finish();
Ok((rope, encoding))
}

// The documentation and implementation of this function should be up-to-date with
// its sibling function, `from_reader()`.
//
/// Encodes the text inside `rope` into the given `encoding` and writes the
/// encoded output into `writer.` As a `Rope` can only contain valid UTF-8,
/// replacement characters may appear in the encoded text.
pub async fn to_writer<'a, W: tokio::io::AsyncWriteExt + Unpin + ?Sized>(
writer: &'a mut W,
encoding: &'static encoding::Encoding,
rope: &'a Rope,
) -> Result<(), Error> {
// Text inside a `Rope` is stored as non-contiguous blocks of data called
// chunks. The absolute size of each chunk is unknown, thus it is impossible
// to predict the end of the chunk iterator ahead of time. Instead, it is
// determined by filtering the iterator to remove all empty chunks and then
// appending an empty chunk to it. This is valuable for detecting when all
// chunks in the `Rope` have been iterated over in the subsequent loop.
let iter = rope
.chunks()
.filter(|c| !c.is_empty())
.chain(std::iter::once(""));
let mut buf = [0u8; BUF_SIZE];
let mut encoder = encoding.new_encoder();
let mut total_written = 0usize;
for chunk in iter {
let is_empty = chunk.is_empty();
let mut total_read = 0usize;

// An inner loop is necessary as it is possible that the input buffer
// may not be completely encoded on the first `encode_from_utf8()` call
// which would happen in cases where the output buffer is filled to
// capacity.
loop {
let (result, read, written, ..) =
encoder.encode_from_utf8(&chunk[total_read..], &mut buf[total_written..], is_empty);

// These variables act as the read and write cursors of `chunk` and `buf` respectively.
// They are necessary in case the output buffer fills before encoding of the entire input
// loop is complete. Otherwise, the loop would endlessly iterate over the same `chunk` and
// the data inside the output buffer would be overwritten.
total_read += read;
total_written += written;
match result {
encoding::CoderResult::InputEmpty => {
debug_assert_eq!(chunk.len(), total_read);
debug_assert!(buf.len() >= total_written);
break;
}
encoding::CoderResult::OutputFull => {
debug_assert!(chunk.len() > total_read);
writer.write_all(&buf[..total_written]).await?;
total_written = 0;
}
}
}

// Once the end of the iterator is reached, the output buffer is
// flushed and the outer loop terminates.
if is_empty {
writer.write_all(&buf[..total_written]).await?;
writer.flush().await?;
break;
}
}
Ok(())
}

fn take_with<T, F>(mut_ref: &mut T, f: F)
where
T: Default,
Expand Down Expand Up @@ -392,7 +214,9 @@ impl Document {
let (rope, encoding) = if path.exists() {
let mut file =
std::fs::File::open(path).context(format!("unable to open {:?}", path))?;
from_reader(&mut file, encoding)?
let (builder, encoding) =
from_reader(&mut file, crate::stream::RopeWrite::default(), encoding)?;
(builder.finish(), encoding)
} else {
let encoding = encoding.unwrap_or(encoding::UTF_8);
(Rope::from(DEFAULT_LINE_ENDING.as_str()), encoding)
Expand Down Expand Up @@ -449,7 +273,7 @@ impl Document {
})?;
{
let mut stdin = process.stdin.take().ok_or(FormatterError::BrokenStdin)?;
to_writer(&mut stdin, encoding::UTF_8, &text)
to_writer(&mut stdin, encoding::UTF_8, text.chunks())
.await
.map_err(|_| FormatterError::BrokenStdin)?;
}
Expand Down Expand Up @@ -578,7 +402,7 @@ impl Document {
}

let mut file = File::create(&path).await?;
to_writer(&mut file, encoding, &text).await?;
to_writer(&mut file, encoding, text.chunks()).await?;

let event = DocumentSavedEvent {
revision: current_rev,
Expand Down Expand Up @@ -644,7 +468,9 @@ impl Document {
.to_owned();

let mut file = std::fs::File::open(&path)?;
let (rope, ..) = from_reader(&mut file, Some(encoding))?;
let rope = from_reader(&mut file, RopeWrite::default(), Some(encoding))?
.0
.finish();

// Calculate the difference between the buffer and source text, and apply it.
// This is not considered a modification of the contents of the file regardless
Expand Down Expand Up @@ -1071,7 +897,13 @@ impl Document {

/// Intialize/updates the differ for this document with a new base.
pub fn set_diff_base(&mut self, diff_base: Vec<u8>, redraw_handle: RedrawHandle) {
if let Ok((diff_base, _)) = from_reader(&mut diff_base.as_slice(), Some(self.encoding)) {
if let Ok(diff_base) = from_reader(
&mut diff_base.as_slice(),
RopeWrite::default(),
Some(self.encoding),
)
.map(|(builder, _)| builder.finish())
{
if let Some(differ) = &self.diff_handle {
differ.update_diff_base(diff_base);
return;
Expand Down Expand Up @@ -1396,67 +1228,4 @@ mod test {
DEFAULT_LINE_ENDING.as_str()
);
}

macro_rules! decode {
($name:ident, $label:expr, $label_override:expr) => {
#[test]
fn $name() {
let encoding = encoding::Encoding::for_label($label_override.as_bytes()).unwrap();
let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/encoding");
let path = base_path.join(format!("{}_in.txt", $label));
let ref_path = base_path.join(format!("{}_in_ref.txt", $label));
assert!(path.exists());
assert!(ref_path.exists());

let mut file = std::fs::File::open(path).unwrap();
let text = from_reader(&mut file, Some(encoding))
.unwrap()
.0
.to_string();
let expectation = std::fs::read_to_string(ref_path).unwrap();
assert_eq!(text[..], expectation[..]);
}
};
($name:ident, $label:expr) => {
decode!($name, $label, $label);
};
}

macro_rules! encode {
($name:ident, $label:expr, $label_override:expr) => {
#[test]
fn $name() {
let encoding = encoding::Encoding::for_label($label_override.as_bytes()).unwrap();
let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/encoding");
let path = base_path.join(format!("{}_out.txt", $label));
let ref_path = base_path.join(format!("{}_out_ref.txt", $label));
assert!(path.exists());
assert!(ref_path.exists());

let text = Rope::from_str(&std::fs::read_to_string(path).unwrap());
let mut buf: Vec<u8> = Vec::new();
helix_lsp::block_on(to_writer(&mut buf, encoding, &text)).unwrap();

let expectation = std::fs::read(ref_path).unwrap();
assert_eq!(buf, expectation);
}
};
($name:ident, $label:expr) => {
encode!($name, $label, $label);
};
}

decode!(big5_decode, "big5");
encode!(big5_encode, "big5");
decode!(euc_kr_decode, "euc_kr", "EUC-KR");
encode!(euc_kr_encode, "euc_kr", "EUC-KR");
decode!(gb18030_decode, "gb18030");
encode!(gb18030_encode, "gb18030");
decode!(iso_2022_jp_decode, "iso_2022_jp", "ISO-2022-JP");
encode!(iso_2022_jp_encode, "iso_2022_jp", "ISO-2022-JP");
decode!(jis0208_decode, "jis0208", "EUC-JP");
encode!(jis0208_encode, "jis0208", "EUC-JP");
decode!(jis0212_decode, "jis0212", "EUC-JP");
decode!(shift_jis_decode, "shift_jis");
encode!(shift_jis_encode, "shift_jis");
}
5 changes: 3 additions & 2 deletions helix-view/src/editor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1114,8 +1114,9 @@ impl Editor {
}

pub fn new_file_from_stdin(&mut self, action: Action) -> Result<DocumentId, Error> {
let (rope, encoding) = crate::document::from_reader(&mut stdin(), None)?;
Ok(self.new_file_from_document(action, Document::from(rope, Some(encoding))))
let (builder, encoding) =
crate::stream::from_reader(&mut stdin(), crate::stream::RopeWrite::default(), None)?;
Ok(self.new_file_from_document(action, Document::from(builder.finish(), Some(encoding))))
}

// ??? possible use for integration tests
Expand Down
1 change: 1 addition & 0 deletions helix-view/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ pub mod editor;
pub mod env;
pub mod graphics;
pub mod gutter;
pub mod stream;
pub mod handlers {
pub mod dap;
pub mod lsp;
Expand Down
Loading