Skip to content

Commit

Permalink
Merge pull request #22
Browse files Browse the repository at this point in the history
Implements the stream processing discussed in #18, which should greatly reduce memory usage during checking. Note that it does not yet limit the maximum line size ([code](https://github.com/loichyan/nerdfix/blob/db421eb/src/input.rs#L79)), so reading an extremely long line may still cause large allocations (though I believe this should be an extremely rare case).

It also adds a file size limit (16MB by default) which can be manually specified by `--size-limit=1GB ...`.

Closes #18
  • Loading branch information
loichyan authored Jul 12, 2024
2 parents aa29181 + b0a6e53 commit 4b888e8
Show file tree
Hide file tree
Showing 7 changed files with 332 additions and 91 deletions.
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ license = "MIT OR Apache-2.0"
edition = "2021"

[dependencies]
bytesize = "1.3"
clap = { version = ">=4.0, <4.5", features = ["derive"] }
codespan-reporting = "0.11.1"
content_inspector = "0.2.4"
Expand Down
42 changes: 33 additions & 9 deletions src/cli.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
//! Command line arguments parser.
use std::io::BufReader;
use std::path::PathBuf;
use std::str::FromStr;
use std::{fmt, fs, io};

use bytesize::ByteSize;
use clap::{Parser, Subcommand, ValueEnum};
use shadow_rs::formatcp;
use thisctx::IntoError;

use crate::icon::Substitution;
use crate::input::InputReader;
use crate::{error, shadow};

// Placeholder names passed as `value_name` to command line arguments.
const V_PATH: &str = "PATH";
const V_SOURCE: &str = "SOURCE";
const V_SUBSTITUTION: &str = "SUBSTITUTION";
const V_FORMAT: &str = "FORMAT";
const V_SIZE: &str = "SIZE";
// Default value of the `size_limit` argument.
const DEFAULT_SIZE: &str = "16MB";
// Contents of the `index-rev` file, embedded at compile time.
const INDEX_REV: &str = include_str!("index-rev");
// Long version text: the package version followed by the cheat-sheet revision.
const CLAP_LONG_VERSION: &str = formatcp!("{}\ncheat-sheet: {}", shadow::PKG_VERSION, INDEX_REV);

Expand Down Expand Up @@ -94,6 +99,9 @@ pub enum Command {
/// Do not skip binary files.
#[arg(long)]
include_binary: bool,
/// Set the file size limit (0 to disable it).
#[arg(long, value_name= V_SIZE, default_value = DEFAULT_SIZE)]
size_limit: ByteSize,
/// Path(s) of files to check.
#[arg(value_name = V_PATH)]
source: Vec<IoPath>,
Expand All @@ -115,6 +123,9 @@ pub enum Command {
/// Do not skip binary files.
#[arg(long)]
include_binary: bool,
/// Set the file size limit (0 to disable it).
#[arg(long, value_name= V_SIZE, default_value = DEFAULT_SIZE)]
size_limit: ByteSize,
/// Path tuple(s) of files to read from and write to.
///
/// Each tuple is an input path followed by an optional output path,
Expand Down Expand Up @@ -184,18 +195,31 @@ impl fmt::Display for IoPath {
}

impl IoPath {
pub fn read_all(&self) -> io::Result<Vec<u8>> {
let mut buf = Vec::new();
match self {
IoPath::Stdio => _ = io::Read::read_to_end(&mut io::stdin(), &mut buf)?,
IoPath::Path(path) => _ = io::Read::read_to_end(&mut fs::File::open(path)?, &mut buf)?,
};
Ok(buf)
/// Queries filesystem metadata for this path.
///
/// Returns `Ok(None)` for standard I/O, which has no path to query, and
/// propagates any error reported by `fs::metadata`.
pub fn metadata(&self) -> io::Result<Option<fs::Metadata>> {
    match self {
        IoPath::Path(path) => {
            let meta = fs::metadata(path)?;
            Ok(Some(meta))
        }
        IoPath::Stdio => Ok(None),
    }
}

/// Returns the size in bytes reported by the metadata of this path, or
/// `Ok(None)` when no metadata is available.
pub fn file_size(&self) -> io::Result<Option<u64>> {
    let meta = self.metadata()?;
    Ok(meta.map(|m| m.len()))
}

/// Opens a buffered reader over standard input or over the file at the path.
///
/// Fails only when the file cannot be opened.
fn get_reader(&self) -> io::Result<Box<dyn io::BufRead>> {
    let reader: Box<dyn io::BufRead> = match self {
        IoPath::Stdio => Box::new(BufReader::new(io::stdin())),
        IoPath::Path(path) => {
            let file = fs::File::open(path)?;
            Box::new(BufReader::new(file))
        }
    };
    Ok(reader)
}

/// Opens this input and wraps it in an `InputReader` for line-by-line reading.
pub fn open(&self) -> io::Result<InputReader> {
    let reader = self.get_reader()?;
    Ok(InputReader::new(reader))
}

pub fn read_to_string(&self) -> io::Result<String> {
self.read_all()
.map(|s| String::from_utf8_lossy(&s).as_ref().to_owned())
self.get_reader().and_then(io::read_to_string)
}

pub fn write_str(&self, content: &str) -> io::Result<()> {
Expand Down
11 changes: 9 additions & 2 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use thisctx::WithContext;
use thiserror::Error;

use crate::icon::Icon;
use crate::input::InputLine;
use crate::runtime::Severity;

pub type Result<T, E = Error> = std::result::Result<T, E>;
Expand Down Expand Up @@ -38,6 +39,12 @@ pub enum Error {
#[source]
inquire::InquireError,
),
#[error("Invalid UTF-8 input")]
Utf8(
#[from]
#[source]
std::str::Utf8Error,
),
#[error("Invalid input")]
InvalidInput,
#[error("Invalid codepoint")]
Expand All @@ -50,7 +57,7 @@ pub enum Error {

#[derive(Debug, Error)]
pub(crate) struct ObsoleteIcon<'a> {
pub source_code: &'a str,
pub source_code: &'a InputLine<'a>,
pub icon: &'a Icon,
pub span: (usize, usize),
pub candidates: &'a [&'a Icon],
Expand All @@ -64,7 +71,7 @@ impl fmt::Display for ObsoleteIcon<'_> {

impl Diagnostic for ObsoleteIcon<'_> {
fn source_code(&self) -> Option<&dyn miette::SourceCode> {
Some(&self.source_code)
Some(self.source_code)
}

fn severity(&self) -> Option<miette::Severity> {
Expand Down
175 changes: 175 additions & 0 deletions src/input.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
use std::collections::VecDeque;
use std::io::BufRead;
use std::{io, iter};

use content_inspector::ContentType;
use miette::{MietteSpanContents, SourceCode, SourceSpan, SpanContents};

/// Number of lines preceding the current line that are kept in the window.
const LINES_BEFORE: usize = 1;
/// Number of lines following the current line that are read ahead into the window.
const LINES_AFTER: usize = 3;

/// A line-oriented reader that keeps only a small window of lines in memory:
/// the current line plus up to `LINES_BEFORE` preceding and `LINES_AFTER`
/// following lines.
pub struct InputReader<R = Box<dyn BufRead>> {
/// The underlying source that lines are read from.
reader: R,
/// The raw bytes of every line currently held in the window, stored back to back.
buffer: Vec<u8>,
/// The byte size of each line held in the buffer, oldest first. Entries of
/// `0` are padding for the lines that do not exist before the first line.
line_sizes: VecDeque<usize>,
/// The absolute (zero-based) line number of the current line. Set to
/// `usize::MAX` once the end of input has been reached.
line_count: usize,
/// The absolute position of the current line.
offset: usize,
/// The position of the current line relative to the buffer beginning.
rel_offset: usize,
}

impl<R: BufRead> InputReader<R> {
    /// Creates a reader over `reader` with an empty window; nothing is read
    /// until the first call to [`next_line`](Self::next_line).
    pub fn new(reader: R) -> Self {
        Self {
            reader,
            buffer: Vec::new(),
            line_sizes: VecDeque::new(),
            line_count: 0,
            offset: 0,
            rel_offset: 0,
        }
    }

    /// Advances the window by one line and returns a view of the new current
    /// line, or `Ok(None)` once the input is exhausted.
    pub fn next_line(&mut self) -> io::Result<Option<InputLine>> {
        match self.line_sizes.pop_front() {
            Some(dropped) => {
                // Discard the oldest line so the window slides forward by one.
                debug_assert!(self.line_sizes.len() >= LINES_BEFORE);
                self.buffer.drain(..dropped);
                self.rel_offset -= dropped;

                // Read one more line ahead to refill the window.
                self.read_line()?;
            }
            None => {
                // First call: there are no preceding lines yet, so pad their
                // slots with zero sizes,
                self.line_sizes.reserve(LINES_BEFORE + 1 + LINES_AFTER);
                self.line_sizes.extend(iter::repeat(0).take(LINES_BEFORE));

                // then read the current line and its trailing context lines.
                for _ in 0..=LINES_AFTER {
                    self.read_line()?;
                }
            }
        }

        let Some(&size) = self.line_sizes.get(LINES_BEFORE) else {
            // EOF reached
            self.line_count = usize::MAX;
            return Ok(None);
        };

        let current = InputLine {
            buffer: &self.buffer,
            line_sizes: &self.line_sizes,
            line_count: self.line_count,
            offset: self.offset,
            rel_offset: self.rel_offset,
            size,
        };
        // Move the cursors past the line that was just handed out.
        self.line_count += 1;
        self.offset += size;
        self.rel_offset += size;

        Ok(Some(current))
    }

    /// Appends the next line (up to and including `\n`) to the buffer and
    /// records its size. Returns the number of bytes read; `0` means end of
    /// input, in which case no size is recorded.
    fn read_line(&mut self) -> io::Result<usize> {
        // TODO: limit line size
        let read = self.reader.read_until(b'\n', &mut self.buffer)?;
        if read > 0 {
            self.line_sizes.push_back(read);
        }
        Ok(read)
    }
}

/// A borrowed view of one line together with the window of surrounding lines
/// held by the reader that produced it.
#[derive(Debug)]
pub struct InputLine<'a> {
/// The bytes of all lines in the window, stored back to back.
buffer: &'a [u8],
/// The byte size of each line in the window, oldest first; `0` marks padding.
line_sizes: &'a VecDeque<usize>,
/// The absolute (zero-based) line number of this line.
line_count: usize,
/// The absolute position of this line in the input.
offset: usize,
/// The position of this line relative to the buffer beginning.
rel_offset: usize,
/// The size of this line in bytes.
size: usize,
}

impl<'a> InputLine<'a> {
    /// Returns the bytes of this line only, without the surrounding window.
    pub fn contents(&self) -> &'a [u8] {
        let window: &'a [u8] = self.buffer;
        let end = self.rel_offset + self.size;
        &window[self.rel_offset..end]
    }

    /// Converts a byte index relative to the start of this line into an
    /// absolute offset.
    pub fn offset_of(&self, i: usize) -> usize {
        self.offset + i
    }

    /// Inspects the bytes of the whole window (not just this line) and
    /// returns the content type reported by `content_inspector`.
    pub fn content_type(&self) -> ContentType {
        let window = self.buffer;
        content_inspector::inspect(window)
    }
}

impl SourceCode for InputLine<'_> {
    /// Returns the bytes covering `span`, optionally extended with whole
    /// context lines taken from the window.
    ///
    /// * `span` uses absolute offsets and must start within this line.
    /// * `lines_before` is the number of preceding lines to include; at most
    ///   `LINES_BEFORE` are available. When it is `0` the result starts
    ///   exactly at the span start.
    /// * `lines_after` is the number of following lines to include, limited
    ///   to what the window holds. When it is `0` the result ends exactly at
    ///   the span end.
    fn read_span<'a>(
        &'a self,
        span: &SourceSpan,
        lines_before: usize,
        lines_after: usize,
    ) -> Result<Box<dyn SpanContents<'a> + 'a>, miette::MietteError> {
        debug_assert!((self.offset..self.offset + self.size).contains(&span.offset()));

        let start;
        let offset;
        let line;
        let column;
        if lines_before == 0 {
            offset = span.offset();
            column = offset - self.offset;
            start = self.rel_offset + column;
            line = self.line_count;
        } else {
            // Count precedent lines and bytes. Zero-sized entries are padding
            // for lines that do not exist, so stop at the first of them.
            let (lines, bytes) = self
                .line_sizes
                .range(0..LINES_BEFORE)
                .copied()
                .rev()
                .take(lines_before)
                .take_while(|&n| n > 0)
                .fold((0, 0), |(lines, bytes), n| (lines + 1, bytes + n));

            offset = self.offset - bytes;
            column = 0;
            start = self.rel_offset - bytes;
            line = self.line_count - lines;
        }

        let end;
        let line_count;
        if lines_after == 0 {
            // End at the span end. This is measured from the span start rather
            // than from `start`, because `start` may have been moved back to
            // include leading context.
            end = self.rel_offset + (span.offset() - self.offset) + span.len();
            line_count = self.line_count;
        } else {
            // Count the current line plus the requested number of subsequent
            // lines; the limit is `lines_after`, independent of `lines_before`.
            let (lines, bytes) = self
                .line_sizes
                .range(LINES_BEFORE..)
                .copied()
                .take(lines_after + 1)
                .take_while(|&n| n > 0)
                .fold((0, 0), |(lines, bytes), n| (lines + 1, bytes + n));

            end = self.rel_offset + bytes;
            line_count = self.line_count + lines;
        }

        let data = &self.buffer[start..end];
        let span = SourceSpan::from((offset, end - start));

        Ok(Box::new(MietteSpanContents::new(
            data, span, line, column, line_count,
        )))
    }
}
5 changes: 5 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ mod autocomplete;
mod cli;
mod error;
mod icon;
mod input;
mod parser;
mod prompt;
mod runtime;
Expand Down Expand Up @@ -108,12 +109,14 @@ fn main_impl() -> error::Result<()> {
source,
recursive,
include_binary,
size_limit,
} => {
let rt = rt.build();
let mut context = CheckerContext {
format,
writer: Box::new(std::io::stdout()),
include_binary,
size_limit: size_limit.as_u64(),
..Default::default()
};
for source in walk(source.into_iter().map(|p| Source(p, None)), recursive) {
Expand All @@ -131,6 +134,7 @@ fn main_impl() -> error::Result<()> {
select_first,
recursive,
include_binary,
size_limit,
source,
} => {
if yes {
Expand All @@ -141,6 +145,7 @@ fn main_impl() -> error::Result<()> {
write,
select_first,
include_binary,
size_limit: size_limit.as_u64(),
..Default::default()
};
let mut buffer = String::new();
Expand Down
Loading

0 comments on commit 4b888e8

Please sign in to comment.