Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,111 changes: 1,065 additions & 46 deletions Cargo.lock

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,21 @@ path = "src/lib.rs"
name = "noseyparker"
path = "src/bin/noseyparker/main.rs"

[features]
default = ["hyperscan"]
hyperscan = ["dep:hyperscan"]

[dependencies]
# anyhow = { version = "1.0", features = ["backtrace"] } # add backtraces to errors -- not sure how expensive this is
anyhow = { version = "1.0" }
atty = "0.2"
clap = { version = "4.0", features = ["cargo", "derive", "env", "unicode", "wrap_help"] }
git2 = { version = "0.15", features = ["vendored-libgit2", "vendored-openssl"] }
git-repository = { version = "0.30.0", features = ["max-performance"] }
libc = "0.2"
libgit2-sys = "*"
hex = "0.4"
hyperscan = { version = "0.3", features = ["full", "static"] }
hyperscan = { version = "0.3", features = ["full", "static"], optional = true }
# hyperscan-sys = { version = "0.3", features = ["full", "static"] }
include_dir = { version = "0.7", features = ["glob"] }
indenter = "0.3"
Expand Down
3 changes: 3 additions & 0 deletions src/bin/noseyparker/cmd_rules.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use anyhow::{Context, Result, bail};
#[cfg(feature = "hyperscan")]
use hyperscan::prelude::{pattern, BlockDatabase, Builder, Matching};

use tracing::{debug_span, error, error_span, info, warn};
Expand Down Expand Up @@ -45,6 +46,7 @@ fn cmd_rules_check(_global_args: &args::GlobalArgs, args: &args::RulesCheckArgs)
Ok(())
}

#[cfg(feature = "hyperscan")]
fn hs_compile_pattern(pat: &str) -> Result<BlockDatabase> {
let pattern = pattern! {pat};
let db: BlockDatabase = pattern.build()?;
Expand Down Expand Up @@ -123,6 +125,7 @@ fn check_rule(rule_num: usize, rule: &Rule) -> Result<CheckStats> {
// Ok(_db) => {}
// }

#[cfg(feature = "hyperscan")]
match hs_compile_pattern(&rule.uncommented_pattern()) {
Err(e) => {
error!("Hyperscan: failed to compile pattern: {}", e);
Expand Down
16 changes: 9 additions & 7 deletions src/bin/noseyparker/cmd_scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use std::sync::mpsc;
use std::sync::Mutex;
use std::time::Instant;
use tracing::{debug, debug_span, error};
use git_repository as git;

use crate::args;

Expand Down Expand Up @@ -266,18 +267,18 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
// Scan Git repo inputs
// ---------------------------------------------------------------------------------------------
inputs.git_repos.par_iter().for_each(|git_repo_result| {
let repo = open_git_repo(&git_repo_result.path)
.ok()
.flatten()
.expect("should be able to re-open repository").into_sync();
git_repo_result
.blobs
.par_iter()
.with_min_len(128)
.for_each_init(
|| {
let repo = open_git_repo(&git_repo_result.path)
.ok()
.flatten()
.expect("should be able to re-open repository");
let matcher = make_matcher().expect("should be able to create a matcher");
(repo, matcher, progress.clone())
(repo.to_thread_local(), matcher, progress.clone())
},
|(repo, matcher, progress), (oid, size)| {
progress.inc(*size);
Expand All @@ -290,15 +291,16 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
if seen_blobs.contains(&blob_id) {
return;
}
let blob = match repo.find_blob(*oid) {
let blob = match repo.find_object(git::hash::ObjectId::from(oid.as_bytes())) {
Err(e) => {
error!(
"Failed to read blob {} from Git repository at {:?}: {}",
oid, path, e
);
return;
}
Ok(blob) => Blob::new(blob_id, blob.content().to_owned()),
// TODO: get rid of this extra copy
Ok(blob) => Blob::new(blob_id, blob.data.to_owned()),
};
let provenance = Provenance::FromGitRepo(path.to_path_buf());
match matcher.scan_blob(&blob, &provenance) {
Expand Down
11 changes: 11 additions & 0 deletions src/blob_id.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use anyhow::Result;
use git_repository as git;

// -------------------------------------------------------------------------------------------------
// BlobId
Expand Down Expand Up @@ -60,6 +61,16 @@ impl BlobId {
}
}

impl<'a> From<&'a git::ObjectId> for BlobId {
fn from(id: &'a git::ObjectId) -> Self {
BlobId(
id.as_bytes()
.try_into()
.expect("oid should be a 20-byte value"),
)
}
}

impl std::fmt::Display for BlobId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.hex())
Expand Down
50 changes: 32 additions & 18 deletions src/input_enumerator.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use anyhow::{bail, Result};
use anyhow::{bail, Context, Result};
use git2::{Oid, Repository, RepositoryOpenFlags};
use git_repository as git;
use ignore::{WalkBuilder, WalkState};
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
Expand Down Expand Up @@ -256,7 +257,7 @@ impl FilesystemEnumerator {
}

/// Opens the given Git repository using libgit2 (the `git2` crate) if it exists, returning None otherwise.
pub fn open_git_repo(path: &Path) -> Result<Option<Repository>> {
pub fn open_git2_repo(path: &Path) -> Result<Option<Repository>> {
match Repository::open_ext(
path,
RepositoryOpenFlags::NO_SEARCH | RepositoryOpenFlags::NO_DOTGIT, // | RepositoryOpenFlags::BARE,
Expand All @@ -270,16 +271,32 @@ pub fn open_git_repo(path: &Path) -> Result<Option<Repository>> {
}
}

/// Opens the given Git repository if it exists, returning None otherwise.
pub fn open_git_repo(path: &Path) -> Result<Option<git::Repository>> {
match git::open_opts(
path,
{
let mut opts = git::open::Options::isolated();
opts.permissions.env.objects = git::sec::Permission::Allow;
opts
}
) {
Err(git::open::Error::NotARepository{ .. }) => Ok(None),
Err(err) => Err(err.into()),
Ok(r) => Ok(Some(r)),
}
}

/// The result of enumerating a Git repository with `GitRepoEnumerator::run`.
pub struct GitRepoEnumeratorResult {
/// One `(object id, size)` pair for each blob object encountered; the size is
/// the value read from the object's header, widened to `u64`.
pub blobs: Vec<(Oid, u64)>,
}

pub struct GitRepoEnumerator<'a> {
repo: &'a Repository,
repo: &'a git::Repository,
}

impl<'a> GitRepoEnumerator<'a> {
pub fn new(repo: &'a Repository) -> Self {
pub fn new(repo: &'a git::Repository) -> Self {
GitRepoEnumerator { repo }
}

Expand All @@ -293,29 +310,26 @@ impl<'a> GitRepoEnumerator<'a> {
// }

pub fn run(&self, progress: &mut Progress) -> Result<GitRepoEnumeratorResult> {
use git::prelude::HeaderExt;
let mut blobs: Vec<(Oid, u64)> = Vec::new();

let odb = self.repo.odb()?;
odb.foreach(|oid: &git2::Oid| {
let (obj_size, obj_type) = match odb.read_header(*oid) {
Err(e) => {
error!("Failed to read object header {}: {}", oid, e);
return true;
}
Ok(v) => v,
};
let odb = &self.repo.objects;
for oid in odb.iter()?
.with_ordering(git::odb::store::iter::Ordering::PackAscendingOffsetThenLooseLexicographical)
.filter_map(Result::ok) {
let hdr = odb.header(oid).with_context(|| format!("Failed to read object header {}", oid))?;
let obj_type = hdr.kind();
match obj_type {
git2::ObjectType::Blob => {
let obj_size = obj_size as u64;
git::object::Kind::Blob => {
let obj_size = hdr.size() as u64;
progress.inc(obj_size);
blobs.push((*oid, obj_size));
blobs.push((git2::Oid::from_bytes(oid.as_bytes())?, obj_size));
// let read_size = odb.read(*oid).unwrap().len();
// assert_eq!(obj_size, read_size);
}
_ => {}
}
true
})?;
};

Ok(GitRepoEnumeratorResult { blobs })
}
Expand Down
4 changes: 2 additions & 2 deletions src/match_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ pub struct Match {

impl Match {
#[inline]
pub fn new<'r, 'b>(
pub fn new(
loc_mapping: &LocationMapping,
blob_match: BlobMatch<'r, 'b>,
blob_match: BlobMatch<'_, '_>,
provenance: &Provenance,
) -> Vec<Self> {
let offsets = &blob_match.matching_input_offset_span;
Expand Down
40 changes: 23 additions & 17 deletions src/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ pub struct BlobMatch<'r, 'b> {
/// If doing multi-threaded scanning, use a separate `Matcher` for each thread.
pub struct Matcher<'a> {
/// A scratch buffer for Hyperscan
#[cfg(feature = "hyperscan")]
hs_scratch: hyperscan::Scratch,

/// A scratch vector for raw matches from Hyperscan, to minimize allocation
Expand Down Expand Up @@ -96,6 +97,7 @@ impl<'a> Matcher<'a> {
global_stats: Option<&'a Mutex<MatcherStats>>,
) -> Result<Self> {
Ok(Matcher {
#[cfg(feature = "hyperscan")]
hs_scratch: rules_db.hsdb.alloc_scratch()?,
raw_matches_scratch: Vec::with_capacity(16384),
rules_db,
Expand All @@ -106,25 +108,29 @@ impl<'a> Matcher<'a> {
}

#[inline]
#[cfg_attr(not(feature = "hyperscan"), allow(unused_variables))]
fn scan_bytes_raw(&mut self, input: &[u8]) -> Result<()> {
self.raw_matches_scratch.clear();
let input_len: u64 = input.len().try_into().unwrap();
self.rules_db
.hsdb
.scan(input, &self.hs_scratch, |id: u32, from: u64, to: u64, _flags: u32| {
// let start_idx = if from == hyperscan_sys::HS_OFFSET_PAST_HORIZON { 0 } else { from };
//
// NOTE: `from` is only going to be meaningful here if we start compiling rules
// with the HS_SOM_LEFTMOST flag. But it doesn't seem to hurt to use the 0-value
// provided when that flag is not used.
let start_idx = std::cmp::min(from.try_into().unwrap(), input_len);
self.raw_matches_scratch.push(RawMatch {
rule_id: id.try_into().unwrap(),
start_idx,
end_idx: to.try_into().unwrap(),
});
hyperscan::Matching::Continue
})?;
#[cfg(feature = "hyperscan")]
{
let input_len: u64 = input.len().try_into().unwrap();
self.rules_db
.hsdb
.scan(input, &self.hs_scratch, |id: u32, from: u64, to: u64, _flags: u32| {
// let start_idx = if from == hyperscan_sys::HS_OFFSET_PAST_HORIZON { 0 } else { from };
//
// NOTE: `from` is only going to be meaningful here if we start compiling rules
// with the HS_SOM_LEFTMOST flag. But it doesn't seem to hurt to use the 0-value
// provided when that flag is not used.
let start_idx = std::cmp::min(from.try_into().unwrap(), input_len);
self.raw_matches_scratch.push(RawMatch {
rule_id: id.try_into().unwrap(),
start_idx,
end_idx: to.try_into().unwrap(),
});
hyperscan::Matching::Continue
})?;
}
Ok(())
}

Expand Down
10 changes: 9 additions & 1 deletion src/rules_database.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use anyhow::{bail, Context, Result};
use anyhow::{bail, Result};
#[cfg(feature = "hyperscan")]
use anyhow::{Context};
#[cfg(feature = "hyperscan")]
use hyperscan::prelude::{Builder, Pattern, Patterns};
use regex::bytes::Regex;
use std::path::Path;
Expand All @@ -11,6 +14,7 @@ pub struct RulesDatabase {
// NOTE: pub(crate) here so that `Matcher` can access these
pub(crate) rules: Rules,
pub(crate) anchored_regexes: Vec<Regex>,
#[cfg(feature = "hyperscan")]
pub(crate) hsdb: hyperscan::BlockDatabase,
}

Expand All @@ -33,6 +37,7 @@ impl RulesDatabase {
bail!("No rules to compile");
}

#[cfg(feature = "hyperscan")]
let patterns = rules
.rules
.iter()
Expand All @@ -43,6 +48,7 @@ impl RulesDatabase {
.collect::<Result<Vec<Pattern>>>()?;

let t1 = Instant::now();
#[cfg(feature = "hyperscan")]
let hsdb = Patterns::build(&Patterns::from(patterns))?;
let d1 = t1.elapsed().as_secs_f64();

Expand All @@ -57,6 +63,7 @@ impl RulesDatabase {
debug!("Compiled {} rules: hyperscan {}s; regex {}s", rules.rules.len(), d1, d2);
Ok(RulesDatabase {
rules,
#[cfg(feature = "hyperscan")]
hsdb,
anchored_regexes,
})
Expand All @@ -71,6 +78,7 @@ impl RulesDatabase {
}

#[cfg(test)]
#[cfg(feature = "hyperscan")]
mod test {
use super::*;
use pretty_assertions::assert_eq;
Expand Down