diff --git a/ruby/command-t/ext/command-t/match.c b/ruby/command-t/ext/command-t/match.c index 04b247ae..71761ec5 100644 --- a/ruby/command-t/ext/command-t/match.c +++ b/ruby/command-t/ext/command-t/match.c @@ -10,20 +10,18 @@ // Use a struct to make passing params during recursion easier. typedef struct { - char *haystack_p; // Pointer to the path string to be searched. - long haystack_len; // Length of same. - char *needle_p; // Pointer to search string (needle). - long needle_len; // Length of same. - long *rightmost_match_p; // Rightmost match for each char in needle. - float max_score_per_char; - int always_show_dot_files; // Boolean. - int never_show_dot_files; // Boolean. - int case_sensitive; // Boolean. - int recurse; // Boolean. - float *memo; // Memoization. + const char *haystack_p; // Pointer to the path string to be searched. + long haystack_len; // Length of same. + const char *needle_p; // Pointer to search string (needle). + long needle_len; // Length of same. + long *rightmost_match_p; // Rightmost match for each char in needle. + float max_score_per_char; + int case_sensitive; // Boolean. + int recurse; // Boolean. + float *memo; // Memoization. } matchinfo_t; -float recursive_match( +static float recursive_match( matchinfo_t *m, // Sharable meta-data. long haystack_idx, // Where in the path string to start. long needle_idx, // Where in the needle string to start. @@ -48,17 +46,7 @@ float recursive_match( } c = m->needle_p[i]; d = m->haystack_p[j]; - if (d == '.') { - if (j == 0 || m->haystack_p[j - 1] == '/') { // This is a dot-file. - int dot_search = c == '.'; // Searching for a dot. - if ( - m->never_show_dot_files || - (!dot_search && !m->always_show_dot_files) - ) { - return *memoized = 0.0; - } - } - } else if (d >= 'A' && d <= 'Z' && !m->case_sensitive) { + if (d >= 'A' && d <= 'Z' && !m->case_sensitive) { d += 'a' - 'A'; // Add 32 to downcase. } @@ -120,123 +108,94 @@ float recursive_match( } float calculate_match( + const char *haystack, + size_t haystack_len, VALUE needle, VALUE case_sensitive, - VALUE always_show_dot_files, - VALUE never_show_dot_files, - VALUE recurse, - long needle_bitmask, - match_t *haystack + VALUE recurse ) { matchinfo_t m; long i; float score = 1.0; - int compute_bitmasks = haystack->bitmask == UNSET_BITMASK; - m.haystack_p = haystack->path; - m.haystack_len = haystack->path_len; + m.haystack_p = haystack; + m.haystack_len = haystack_len; m.needle_p = RSTRING_PTR(needle); m.needle_len = RSTRING_LEN(needle); m.rightmost_match_p = NULL; m.max_score_per_char = (1.0 / m.haystack_len + 1.0 / m.needle_len) / 2; - m.always_show_dot_files = always_show_dot_files == Qtrue; - m.never_show_dot_files = never_show_dot_files == Qtrue; m.case_sensitive = (int)case_sensitive; m.recurse = recurse == Qtrue; // Special case for zero-length search string. - if (m.needle_len == 0) { - // Filter out dot files. - if (m.never_show_dot_files || !m.always_show_dot_files) { - for (i = 0; i < m.haystack_len; i++) { - char c = m.haystack_p[i]; - if (c == '.' && (i == 0 || m.haystack_p[i - 1] == '/')) { - return 0.0; - } - } - } - } else { - long haystack_limit; - long memo_size; - long needle_idx; - long mask; - long rightmost_match_p[m.needle_len]; - - if (haystack->bitmask != UNSET_BITMASK) { - if ((needle_bitmask & haystack->bitmask) != needle_bitmask) { - return 0.0; - } + if (m.needle_len == 0) return score; + + long haystack_limit; + long memo_size; + long needle_idx; + long rightmost_match_p[m.needle_len]; + + // Pre-scan string: + // - Bail if it can't match at all. + // - Record rightmost match for each character (prune search space). + m.rightmost_match_p = rightmost_match_p; + needle_idx = m.needle_len - 1; + for (i = m.haystack_len - 1; i >= 0; i--) { + char c = m.haystack_p[i]; + char lower = c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; + if (!m.case_sensitive) { + c = lower; } - // Pre-scan string: - // - Bail if it can't match at all. - // - Record rightmost match for each character (prune search space). - // - Record bitmask for haystack to speed up future searches. - m.rightmost_match_p = rightmost_match_p; - needle_idx = m.needle_len - 1; - mask = 0; - for (i = m.haystack_len - 1; i >= 0; i--) { - char c = m.haystack_p[i]; - char lower = c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; - if (!m.case_sensitive) { - c = lower; - } - if (compute_bitmasks) { - mask |= (1 << (lower - 'a')); - } - - if (needle_idx >= 0) { - char d = m.needle_p[needle_idx]; - if (c == d) { - rightmost_match_p[needle_idx] = i; - needle_idx--; - } + if (needle_idx >= 0) { + char d = m.needle_p[needle_idx]; + if (c == d) { + rightmost_match_p[needle_idx] = i; + needle_idx--; } } - if (compute_bitmasks) { - haystack->bitmask = mask; - } - if (needle_idx != -1) { - return 0.0; - } + } + if (needle_idx != -1) { + return 0.0; + } - // Prepare for memoization. - haystack_limit = rightmost_match_p[m.needle_len - 1] + 1; - memo_size = m.needle_len * haystack_limit; - { - float memo[memo_size]; - for (i = 0; i < memo_size; i++) { - memo[i] = UNSET_SCORE; - } - m.memo = memo; - score = recursive_match(&m, 0, 0, 0, 0.0); + // Prepare for memoization. + haystack_limit = rightmost_match_p[m.needle_len - 1] + 1; + memo_size = m.needle_len * haystack_limit; + { + float memo[memo_size]; + for (i = 0; i < memo_size; i++) { + memo[i] = UNSET_SCORE; + } + m.memo = memo; + score = recursive_match(&m, 0, 0, 0, 0.0); #ifdef DEBUG - fprintf(stdout, " "); - for (i = 0; i < m.needle_len; i++) { - fprintf(stdout, " %c ", m.needle_p[i]); + fprintf(stderr, " "); + for (i = 0; i < m.needle_len; i++) { + fprintf(stderr, " %c ", m.needle_p[i]); + } + fprintf(stderr, "\n"); + for (i = 0; i < memo_size; i++) { + char formatted[8]; + if (i % m.needle_len == 0) { + long haystack_idx = i / m.needle_len; + fprintf(stderr, "%c: ", m.haystack_p[haystack_idx]); } - fprintf(stdout, "\n"); - for (i = 0; i < memo_size; i++) { - char formatted[8]; - if (i % m.needle_len == 0) { - long haystack_idx = i / m.needle_len; - fprintf(stdout, "%c: ", m.haystack_p[haystack_idx]); - } - if (memo[i] == UNSET_SCORE) { - snprintf(formatted, sizeof(formatted), " - "); - } else { - snprintf(formatted, sizeof(formatted), " %-.4f", memo[i]); - } - fprintf(stdout, "%s", formatted); - if ((i + 1) % m.needle_len == 0) { - fprintf(stdout, "\n"); - } else { - fprintf(stdout, " "); - } + if (memo[i] == UNSET_SCORE) { + snprintf(formatted, sizeof(formatted), " - "); + } else { + snprintf(formatted, sizeof(formatted), " %-.4f", memo[i]); + } + fprintf(stderr, "%s", formatted); + if ((i + 1) % m.needle_len == 0) { + fprintf(stderr, "\n"); + } else { + fprintf(stderr, " "); } - fprintf(stdout, "Final score: %f\n\n", score); -#endif } + fprintf(stderr, "Final score: %f\n\n", score); +#endif } + return score; } diff --git a/ruby/command-t/ext/command-t/match.h b/ruby/command-t/ext/command-t/match.h index af497a62..777a76c3 100644 --- a/ruby/command-t/ext/command-t/match.h +++ b/ruby/command-t/ext/command-t/match.h @@ -6,30 +6,21 @@ #include +#include "scanner.h" + #define UNSET_BITMASK (-1) // Struct for representing an individual match. typedef struct { - char *path; - int32_t path_len; float score; - long bitmask; + paths_t *path; } match_t; -// Struct for representing a collection of matches. -typedef struct { - int len; - match_t matches[]; -} matches_t; - extern float calculate_match( + const char *haystack, + size_t haystack_len, VALUE needle, VALUE case_sensitive, - VALUE always_show_dot_files, - VALUE never_show_dot_files, - VALUE recurse, - long needle_bitmask, - match_t *haystack -); + VALUE recurse); #endif diff --git a/ruby/command-t/ext/command-t/matcher.c b/ruby/command-t/ext/command-t/matcher.c index b4ba078c..d34969c5 100644 --- a/ruby/command-t/ext/command-t/matcher.c +++ b/ruby/command-t/ext/command-t/matcher.c @@ -1,8 +1,11 @@ // Copyright 2010-present Greg Hurrell. All rights reserved. // Licensed under the terms of the BSD 2-clause license. +#include +#include #include /* for qsort() */ #include /* for strncmp() */ + #include "match.h" #include "matcher.h" #include "heap.h" @@ -15,30 +18,31 @@ #include /* for pthread_create, pthread_join etc */ #endif +static int cmp_path(const paths_t *a, const paths_t *b) { + if (a->length > b->length) return cmp_path(a, b->parent); + if (a->length < b->length) return cmp_path(a->parent, b); + + if (a->parent != b->parent) return cmp_path(a->parent, b->parent); + + size_t min_len = a->path_len < b->path_len? a->path_len : b->path_len; + int r = strncmp(a->path, b->path, min_len); + if (r) return r; + return a->path_len - b->path_len; +} + // Comparison function for use with qsort. int cmp_alpha(const void *a, const void *b) { - match_t a_match = *(match_t *)a; - match_t b_match = *(match_t *)b; - char *a_p = a_match.path; - long a_len = a_match.path_len; - char *b_p = b_match.path; - long b_len = b_match.path_len; - int order = 0; - - if (a_len > b_len) { - order = strncmp(a_p, b_p, b_len); - if (order == 0) - order = 1; // shorter string (b) wins. - } else if (a_len < b_len) { - order = strncmp(a_p, b_p, a_len); - if (order == 0) - order = -1; // shorter string (a) wins. - } else { - order = strncmp(a_p, b_p, a_len); - } + match_t *a_match = (match_t *)a; + match_t *b_match = (match_t *)b; + + paths_t *a_path = a_match->path; + paths_t *b_path = b_match->path; - return order; + if (!a_path->parent) return -1; + if (!b_path->parent) return 1; + + return cmp_path(a_path, b_path); } // Comparison function for use with qsort. @@ -81,94 +85,157 @@ VALUE CommandTMatcher_initialize(int argc, VALUE *argv, VALUE self) } typedef struct { - long thread_count; - long thread_index; + const char *needle; + uint32_t *needle_mask; + size_t needle_len; + size_t haystack_len; +} progress_t; + +typedef struct { + progress_t progress; long case_sensitive; + paths_t *paths; + size_t skip; + size_t scan; long limit; - matches_t *matches; + match_t *matches; VALUE needle; - VALUE always_show_dot_files; - VALUE never_show_dot_files; + int always_show_dot_files; + int never_show_dot_files; VALUE recurse; - long needle_bitmask; + + heap_t *heap; + char buf[PATHS_MAX_LEN]; } thread_args_t; -void *match_thread(void *thread_args) +/** Update match progress. + * + * Advance the match progress based on the passed segment. If + * progress->needle_len is zero when this function returns the current path and + * all subpaths "match". However note that subpaths may be hidden. + * + * @return true if any subpath could match. + */ +static int continue_match(thread_args_t *args, progress_t *progress, paths_t *path) { - long i; - float score; - heap_t *heap = NULL; - thread_args_t *args = (thread_args_t *)thread_args; + if (*progress->needle_mask & ~path->contained_mask) + return 0; + + for (size_t i = 0; i < path->path_len; ++i) { + char c = path->path[i]; + + // Hidden file? + if (c == '.' && ( + progress->haystack_len == 0 || + args->buf[progress->haystack_len - 1] == '/')) { + if (args->never_show_dot_files) + return 0; + if (progress->needle[0] != '.' && !args->always_show_dot_files) + return 0; + } - if (args->limit) { - // Reserve one extra slot so that we can do an insert-then-extract even - // when "full" (effectively allows use of min-heap to maintain a - // top-"limit" list of items). - heap = heap_new(args->limit + 1, cmp_score); - } + // Build up the path in the buffer. + args->buf[progress->haystack_len++] = c; - for ( - i = args->thread_index; - i < args->matches->len; - i += args->thread_count - ) { - if (args->needle_bitmask == UNSET_BITMASK) { - args->matches->matches[i].bitmask = UNSET_BITMASK; - } - args->matches->matches[i].score = calculate_match( - args->needle, - args->case_sensitive, - args->always_show_dot_files, - args->never_show_dot_files, - args->recurse, - args->needle_bitmask, - &args->matches->matches[i] - ); - if (heap) { - if (heap->count == args->limit) { - score = ((match_t *)HEAP_PEEK(heap))->score; - if (args->matches->matches[i].score >= score) { - heap_insert(heap, &args->matches->matches[i]); - (void)heap_extract(heap); - } - } else { - heap_insert(heap, &args->matches->matches[i]); + // Update match progress. + if (progress->needle_len) { + if (!args->case_sensitive) c = tolower(c); + + if (c == progress->needle[0]) { + progress->needle++; + progress->needle_len--; + progress->needle_mask++; + + if (*progress->needle_mask & ~path->contained_mask) + return 0; } } } - return heap; + return 1; } -long calculate_bitmask(VALUE string) { - char *str = RSTRING_PTR(string); - long len = RSTRING_LEN(string); - long i; - long mask = 0; - for (i = 0; i < len; i++) { - if (str[i] >= 'a' && str[i] <= 'z') { - mask |= (1 << (str[i] - 'a')); - } else if (str[i] >= 'A' && str[i] <= 'Z') { - mask |= (1 << (str[i] - 'A')); +void do_match(thread_args_t *args, paths_t *paths, progress_t progress) { + if (!continue_match(args, &progress, paths)) { + if (args->skip > paths->length) args->skip -= paths->length; + else { + size_t extra = paths->length - args->skip; + if (extra < args->scan) args->scan -= extra; + else args->scan = 0; } + return; + } + + if (!args->skip && paths->leaf && !progress.needle_len) { + match_t new_match = { + .path = paths, + .score = calculate_match( + args->buf, + progress.haystack_len, + args->needle, + args->case_sensitive, + args->recurse), + }; + + if (args->heap && args->heap->count == args->limit) { + // Note: We can just compare the score because we are iterating in + // alphabetical order so earlier items are preferred for equal score. + if (new_match.score > ((match_t*)HEAP_PEEK(args->heap))->score) { + match_t *buf = heap_extract(args->heap); + *buf = new_match; + heap_insert(args->heap, buf); + } + } else { + *args->matches = new_match; + if (args->heap) heap_insert(args->heap, args->matches); + + args->matches++; + } + } + if (paths->leaf && !args->skip && args->scan) if (!--args->scan) return; + if (paths->leaf && args->skip) args->skip -= 1; + + for (size_t i = 0; i < paths->subpaths_len; i++) { + paths_t *next = paths->subpaths[i]; + if (args->skip >= next->length) { + args->skip -= next->length; + continue; + } + do_match(args, next, progress); + if (!args->scan) return; + } +} + +void *match_thread(void *thread_args) +{ + thread_args_t *args = (thread_args_t *)thread_args; + + match_t *orig_matches = args->matches; + + if (args->limit) { + args->heap = heap_new(args->limit, cmp_score); + } + + do_match(args, args->paths, args->progress); + + size_t matches; + if (args->heap) { + matches = args->heap->count; + } else { + matches = args->matches - orig_matches; } - return mask; + + heap_free(args->heap); + + return (void*)matches; } VALUE CommandTMatcher_sorted_matches_for(int argc, VALUE *argv, VALUE self) { - long i, j, limit, thread_count; -#ifdef HAVE_PTHREAD_H - long err; - pthread_t *threads; -#endif - long needle_bitmask; - int use_heap; + size_t i, limit, thread_count, err; int sort; - matches_t *matches; - matches_t *heap_matches; - heap_t *heap; - thread_args_t *thread_args; + size_t matches_len = 0; + paths_t *paths; VALUE always_show_dot_files; VALUE case_sensitive; VALUE recurse; @@ -201,7 +268,6 @@ VALUE CommandTMatcher_sorted_matches_for(int argc, VALUE *argv, VALUE self) limit = NIL_P(limit_option) ? 15 : NUM2LONG(limit_option); sort = NIL_P(sort_option) || sort_option == Qtrue; - use_heap = limit && sort; needle = StringValue(needle); if (case_sensitive != Qtrue) @@ -210,127 +276,116 @@ VALUE CommandTMatcher_sorted_matches_for(int argc, VALUE *argv, VALUE self) if (ignore_spaces == Qtrue) needle = rb_funcall(needle, rb_intern("delete"), 1, rb_str_new2(" ")); + const char *needle_str = RSTRING_PTR(needle); + size_t needle_len = RSTRING_LEN(needle); + + uint32_t needle_masks[needle_len + 1]; + i = needle_len; + needle_masks[i] = 0; + while (i--) { + needle_masks[i] = needle_masks[i+1] | hash_char(needle_str[i]); + } + // Get unsorted matches. scanner = rb_iv_get(self, "@scanner"); paths_obj = rb_funcall(scanner, rb_intern("c_paths"), 0); - matches = paths_get_matches(paths_obj); - if (matches == NULL) { + paths = CommandTPaths_get_paths(paths_obj); + if (paths == NULL) { rb_raise(rb_eArgError, "null matches"); } - needle_bitmask = calculate_bitmask(needle); + if (!limit) limit = paths->length; - thread_count = NIL_P(threads_option) ? 1 : NUM2LONG(threads_option); - if (use_heap) { - heap_matches = malloc( - sizeof(matches_t) + (thread_count * limit + 1) * sizeof(match_t)); - if (!heap_matches) { - rb_raise(rb_eNoMemError, "memory allocation failed"); - } - heap_matches->len = 0; - } else { - heap_matches = matches; - } + size_t handled_paths = 0; #ifdef HAVE_PTHREAD_H -#define THREAD_THRESHOLD 1000 /* avoid the overhead of threading when search space is small */ - if (matches->len < THREAD_THRESHOLD) { - thread_count = 1; + thread_count = NIL_P(threads_option) ? 0 : NUM2LONG(threads_option); + size_t paths_per_thread = 10000; + if (thread_count) { + if (paths->length / thread_count < paths_per_thread) { + thread_count = paths->length / paths_per_thread; + } else { + paths_per_thread = paths->length / thread_count; + } } - threads = malloc(sizeof(pthread_t) * thread_count); - if (!threads) - rb_raise(rb_eNoMemError, "memory allocation failed"); -#endif - thread_args = malloc(sizeof(thread_args_t) * thread_count); - if (!thread_args) - rb_raise(rb_eNoMemError, "memory allocation failed"); - for (i = 0; i < thread_count; i++) { - thread_args[i].thread_count = thread_count; - thread_args[i].thread_index = i; - thread_args[i].case_sensitive = case_sensitive == Qtrue; - thread_args[i].matches = matches; - thread_args[i].limit = use_heap ? limit : 0; - thread_args[i].needle = needle; - thread_args[i].always_show_dot_files = always_show_dot_files; - thread_args[i].never_show_dot_files = never_show_dot_files; - thread_args[i].recurse = recurse; - thread_args[i].needle_bitmask = needle_bitmask; -#ifdef HAVE_PTHREAD_H - if (i == thread_count - 1) { -#endif - // For the last "worker", we'll just use the main thread. - heap = match_thread(&thread_args[i]); - if (heap) { - for (j = 0; j < heap->count; j++) { - heap_matches->matches[heap_matches->len++] = *(match_t *)heap->entries[j]; - } - heap_free(heap); - } -#ifdef HAVE_PTHREAD_H - } else { - err = pthread_create(&threads[i], NULL, match_thread, (void *)&thread_args[i]); - if (err != 0) { - rb_raise(rb_eSystemCallError, "pthread_create() failure (%d)", (int)err); - } + pthread_t threads[thread_count]; + match_t matches[limit * (thread_count + 1)]; + thread_args_t thread_args[thread_count]; + for (size_t i = 0; i < thread_count; ++i) { + thread_args[i] = (thread_args_t){ + .progress = (progress_t){ + .needle = needle_str, + .needle_len = needle_len, + .needle_mask = needle_masks, + }, + .case_sensitive = case_sensitive == Qtrue, + .paths = paths, + .matches = matches + limit*i, + .limit = limit, + .needle = needle, + .always_show_dot_files = always_show_dot_files == Qtrue, + .never_show_dot_files = never_show_dot_files == Qtrue, + .recurse = recurse, + .skip = handled_paths, + .scan = paths_per_thread, + }; + handled_paths += paths_per_thread; + + err = pthread_create(&threads[i], NULL, match_thread, (void *)&thread_args[i]); + if (err != 0) { + rb_raise(rb_eSystemCallError, "pthread_create() failure (%d)", (int)err); } -#endif } +#endif + + thread_args_t main_thread_arg = { + .progress = (progress_t){ + .needle = needle_str, + .needle_len = needle_len, + .needle_mask = needle_masks, + }, + .case_sensitive = case_sensitive == Qtrue, + .paths = paths, + .matches = matches + limit*thread_count, + .limit = limit, + .needle = needle, + .always_show_dot_files = always_show_dot_files == Qtrue, + .never_show_dot_files = never_show_dot_files == Qtrue, + .recurse = recurse, + .skip = handled_paths, + .scan = SIZE_MAX, + }; + size_t main_matches = (size_t)match_thread(&main_thread_arg); #ifdef HAVE_PTHREAD_H - for (i = 0; i < thread_count - 1; i++) { - err = pthread_join(threads[i], (void **)&heap); + for (i = 0; i < thread_count; i++) { + size_t match_count; + err = pthread_join(threads[i], (void *)&match_count); if (err != 0) { rb_raise(rb_eSystemCallError, "pthread_join() failure (%d)", (int)err); } - if (heap) { - for (j = 0; j < heap->count; j++) { - heap_matches->matches[heap_matches->len++] = *(match_t *)heap->entries[j]; - } - heap_free(heap); - } + memmove( + matches + matches_len, matches + limit*i, + match_count * sizeof(match_t)); + matches_len += match_count; } - free(threads); + memmove( + matches + matches_len, matches + limit*thread_count, + main_matches * sizeof(match_t)); #endif + matches_len += main_matches; if (sort) { - if ( - RSTRING_LEN(needle) == 0 || - (RSTRING_LEN(needle) == 1 && RSTRING_PTR(needle)[0] == '.') - ) { - // Alphabetic order if search string is only "" or "." - // TODO: make those semantics fully apply to heap case as well - // (they don't because the heap itself calls cmp_score, which means - // that the items which stay in the top [limit] may (will) be - // different). - qsort(heap_matches->matches, heap_matches->len, sizeof(match_t), cmp_alpha); - } else { - qsort(heap_matches->matches, heap_matches->len, sizeof(match_t), cmp_score); - } + qsort(matches, matches_len, sizeof(match_t), cmp_score); } results = rb_ary_new(); - if (limit == 0) - limit = matches->len; - for ( - i = 0; - i < heap_matches->len && limit > 0; - i++ - ) { - if (heap_matches->matches[i].score > 0.0) { - rb_funcall( - results, - rb_intern("push"), - 1, - rb_str_new( - heap_matches->matches[i].path, - heap_matches->matches[i].path_len)); - limit--; - } + if (matches_len > limit) matches_len = limit; + for (i = 0; i < matches_len; i++) { + VALUE path = paths_to_s(matches[i].path); + rb_funcall(results, rb_intern("push"), 1, path); } - if (use_heap) { - free(heap_matches); - } return results; } diff --git a/ruby/command-t/ext/command-t/scanner.c b/ruby/command-t/ext/command-t/scanner.c index 95a3a43a..8ce47f3e 100644 --- a/ruby/command-t/ext/command-t/scanner.c +++ b/ruby/command-t/ext/command-t/scanner.c @@ -11,48 +11,184 @@ #include "matcher.h" #include "ext.h" -typedef struct { - VALUE source; - size_t bufsize; - matches_t matches; -} paths_t; - -void mark_paths(paths_t *paths) { - rb_gc_mark(paths->source); +static void paths_free(paths_t *paths) { + for (size_t i = 0; i < paths->subpaths_len; i++) { + paths_free(paths->subpaths[i]); + } + if (paths->owned_path) + free((void*)paths->path); + free(paths); } -void free_paths(paths_t *paths) { - munmap(paths, paths->bufsize); +/** Find the longest common prefix of two strings. + * + * @returns The length of the prefix. + */ +static size_t common_prefix(paths_t *a, const char *b, size_t bl) { + size_t len = a->path_len > bl? bl : a->path_len; + for (size_t i = 0; i < len; ++i) { + if (a->path[i] != b[i]) return i; + } + return len; } -VALUE CommandTPaths_from_array(VALUE klass, VALUE source) { - Check_Type(source, T_ARRAY); - rb_obj_freeze(source); - - long len = RARRAY_LEN(source); - long bufsize = sizeof(paths_t) + len * sizeof(match_t); - paths_t *paths = mmap(NULL, bufsize, - PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, - -1, 0); - if (!paths) { +/** Allocate a root path. + */ +static paths_t *paths_new_root(void) { + paths_t *r = calloc(sizeof(paths_t), 1); + if (!r) { rb_raise(rb_eNoMemError, "memory allocation failed"); } - paths->bufsize = bufsize; - paths->matches.len = len; + return r; +} + +static int is_power_of_2(size_t n) { return !(n & (n - 1)); } + +/** Insert a path into a paths. + * + * It is the callers responsibility to ensure that the path should be put at the + * specified location. + * + * @param paths Object to insert into. + * @param i The index at which the path should reside after being inserted. + * @param path The suffix of the path to be inserted. + * @param len The length of path. + */ +static void _paths_insert_at(paths_t *paths, size_t i, const char *path, size_t len) { + // The capacity of .subpaths is implied by .subpaths_len. Basically round up + // .subpaths len to find the current capacity of 0, 2, 4, ... + if (!paths->subpaths_len) paths->subpaths = malloc(2*sizeof(paths_t)); + else if (paths->subpaths_len < 2) {} // len = 1 -> capacity = 2 + else if (is_power_of_2(paths->subpaths_len)) { + size_t capacity = paths->subpaths_len * 2; + if (!capacity) capacity = 2; + + paths->subpaths = realloc(paths->subpaths, capacity*sizeof(paths_t*)); + } + + // Make room. + memmove(paths->subpaths + i + 1, paths->subpaths + i, + sizeof(paths_t*)*(paths->subpaths_len - i)); + paths->subpaths_len++; + + // Create and insert. + paths_t *new = malloc(sizeof(paths_t)); + *new = (paths_t){ + .parent = paths, + .length = 1, + .path = strndup(path, len), + .path_len = len, + .owned_path = 1, + .leaf = 1, + .contained_mask = contained_mask(path, len), + }; + paths->subpaths[i] = new; +} + +/** Add a new path. + * + * @param paths The paths collection to insert into. + * @param path The path to insert. + * @param len The length (in bytes) of path. + */ +static void paths_push(paths_t *paths, const char *path, size_t len) { + paths->length++; + paths->contained_mask |= contained_mask(path, len); + + if (!len) { + paths->leaf = 1; + return; + } + + // Iterate backwards because the common case is adding in order. + for (size_t i = paths->subpaths_len; i--; ) { + paths_t *subpath = paths->subpaths[i]; - // The source must stay around as the backing string will be shared. - paths->source = source; + if (subpath->path[0] == path[0]) { + // First character matches, merge into this entry. + size_t shared = common_prefix(subpath, path, len); + if (shared == subpath->path_len) { + // Goes inside the subpath. + return paths_push(subpath, path + shared, len - shared); + } + + paths_t *new = malloc(sizeof(paths_t)); + if (shared == len) { + // Subpath should be inside this one. + *new = (paths_t){ + .parent = paths, + .length = subpath->length + 1, + .path = subpath->path, + .path_len = shared, + .contained_mask = subpath->contained_mask, + .leaf = 1, + .owned_path = subpath->owned_path, + .subpaths_len = 1, + .subpaths = malloc(2*sizeof(paths_t*)), + }; + new->subpaths[0] = subpath; + } else { + // Create a fork + uint32_t new_chars = contained_mask(path + shared, len - shared); + *new = (paths_t){ + .parent = paths, + .length = subpath->length + 1, + .path = subpath->path, + .path_len = shared, + .contained_mask = subpath->contained_mask | new_chars, + .owned_path = subpath->owned_path, + .subpaths_len = 2, + .subpaths = malloc(2*sizeof(paths_t*)), + }; + paths_t *leaf = malloc(sizeof(paths_t)); + *leaf = (paths_t){ + .parent = new, + .length = 1, + .path = strndup(path + shared, len - shared), + .path_len = len - shared, + .contained_mask = new_chars, + .leaf = 1, + .owned_path = 1, + }; + if (subpath->path[shared] < path[shared]) { + new->subpaths[0] = subpath; + new->subpaths[1] = leaf; + } else { + new->subpaths[0] = leaf; + new->subpaths[1] = subpath; + } + } + paths->subpaths[i] = new; + subpath->parent = new; + subpath->path += shared; + subpath->path_len -= shared; + subpath->owned_path = 0; + // Note: Ideally we would update subpath->contained_mask to not + // include path[0..shared] but that would require a traversal of all + // parents. This value is still "correct" just too conservative. + return; + } else if (subpath->path[0] < path[0]) { + return _paths_insert_at(paths, i+1, path, len); + } + } + + // Before any subpath, so insert it in front. + _paths_insert_at(paths, 0, path, len); +} + +VALUE CommandTPaths_from_array(VALUE klass, VALUE source) { + Check_Type(source, T_ARRAY); + + paths_t *paths = paths_new_root(); + + long len = RARRAY_LEN(source); VALUE *source_array = RARRAY_PTR(source); - while (len--) { - rb_obj_freeze(source_array[len]); - paths->matches.matches[len].path = RSTRING_PTR(source_array[len]); - paths->matches.matches[len].path_len = RSTRING_LEN(source_array[len]); - paths->matches.matches[len].bitmask = UNSET_BITMASK; + for (long i = 0; i < len; ++i) { + paths_push(paths, RSTRING_PTR(source_array[i]), RSTRING_LEN(source_array[i])); } - return Data_Wrap_Struct(klass, mark_paths, free_paths, paths); + return Data_Wrap_Struct(klass, NULL, paths_free, paths); } VALUE CommandTPaths_from_fd(VALUE klass, VALUE source, VALUE term, VALUE opt) { @@ -80,28 +216,16 @@ VALUE CommandTPaths_from_fd(VALUE klass, VALUE source, VALUE term, VALUE opt) { scratch = rb_str_new(NULL, 0); } - size_t buffer_len = 1099511627776; // 1TiB should be enough for anyone. - size_t paths_len = sizeof(paths_t) + (max_files) * sizeof(match_t); - size_t total_len = buffer_len + paths_len; - - paths_t *paths = mmap(NULL, total_len, - PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, - -1, 0); - char *buffer = (char*)paths + paths_len; - if (paths == MAP_FAILED) { - rb_sys_fail(strerror(errno)); - } - paths->bufsize = total_len; - paths->source = Qnil; + paths_t *paths = paths_new_root(); + char buffer[PATHS_MAX_LEN]; char *start = buffer; char *end = buffer; - ssize_t count = 1; + size_t count; long match_count = 0; - while ((count = read(fd, end, 4096)) != 0) { - if (count < 0) { - munmap(paths, total_len); + while ((count = read(fd, end, sizeof(buffer) - (end - start))) != 0) { + if (count <= 0) { + paths_free(paths); rb_raise(rb_eRuntimeError, "read returned error %s", strerror(errno)); } @@ -115,6 +239,12 @@ VALUE CommandTPaths_from_fd(VALUE klass, VALUE source, VALUE term, VALUE opt) { char *path = start + drop; int len = next_end - start - drop; + if (next_end-start < drop) + rb_raise(rb_eRuntimeError, + "Terminator is less then drop away (%lu - %lu) '%.*s'.", + next_end-start, drop, + (int)(next_end-start), start); + start = next_end + 1; if (filter != Qnil) { @@ -126,49 +256,86 @@ VALUE CommandTPaths_from_fd(VALUE klass, VALUE source, VALUE term, VALUE opt) { } } - paths->matches.matches[match_count].path = path; - paths->matches.matches[match_count].path_len = len; - paths->matches.matches[match_count].bitmask = UNSET_BITMASK; - match_count++; + paths_push(paths, path, len); - if (match_count >= max_files) { + if (paths->length >= (size_t)max_files) { goto done; /* break two levels */ } if (update != Qnil && match_count >= next_update) { next_update = NUM2LONG(rb_funcall(update, call, 1, LONG2NUM(match_count))); } } + + size_t remaining = end - start; + memmove(buffer, start, remaining); + start = buffer; + end = start + remaining; } done: - paths->matches.len = match_count; - if (start < end) { rb_raise(rb_eRuntimeError, "Last byte of string must be the terminator."); } - return Data_Wrap_Struct(klass, mark_paths, free_paths, paths); -} - -VALUE CommandTPaths_to_a(VALUE self) { - return matches_to_a(paths_get_matches(self)); + return Data_Wrap_Struct(klass, NULL, paths_free, paths); } -matches_t *paths_get_matches(VALUE self) { +paths_t *CommandTPaths_get_paths(VALUE self) { paths_t *paths; Data_Get_Struct(self, paths_t, paths); - return &paths->matches; + return paths; } -VALUE matches_to_a(matches_t *matches) { - VALUE r = rb_ary_new(); - VALUE push = rb_intern("push"); - int i; - for (i = 0; i < matches->len; i++) { - rb_funcall(r, push, 1, - rb_str_new( - matches->matches[i].path, - matches->matches[i].path_len)); +static void paths_push_to_a(VALUE array, VALUE prefix, paths_t *paths) { + size_t starting_len = RSTRING_LEN(prefix); + + rb_str_buf_cat(prefix, paths->path, paths->path_len); + + if (paths->leaf) { + // Force a copy. + VALUE leaf = rb_str_new(RSTRING_PTR(prefix), RSTRING_LEN(prefix)); + rb_ary_push(array, leaf); } + + for (size_t i = 0; i < paths->subpaths_len; ++i) { + paths_push_to_a(array, prefix, paths->subpaths[i]); + } + + rb_str_set_len(prefix, starting_len); +} + +VALUE CommandTPaths_to_a(VALUE self) { + VALUE r = rb_ary_new(); + VALUE path = rb_str_buf_new(0); + paths_push_to_a(r, path, CommandTPaths_get_paths(self)); return r; } + +static void indent(size_t depth) { while(depth--) fprintf(stderr, "| "); } + +static void paths_dump_depth(const paths_t *paths, size_t depth) { + indent(depth); fprintf(stderr, "PATHPATHPATH: %.*s\n", paths->path_len, paths->path); + indent(depth); fprintf(stderr, "leaf: %u, owned: %u, mask: %#08x\n", + paths->leaf, paths->owned_path, paths->contained_mask); + indent(depth); fprintf(stderr, "subpaths: %ld, total: %ld\n", paths->subpaths_len, paths->length); + for (size_t i = 0; i < paths->subpaths_len; ++i) + paths_dump_depth(paths->subpaths[i], depth + 1); +} + +void paths_dump(const paths_t *paths) { + paths_dump_depth(paths, 0); +} + +static VALUE paths_to_s_internal(const paths_t *paths, size_t len) { + if (!paths->parent) { + return rb_str_buf_new(len); + } + + VALUE buf = paths_to_s_internal(paths->parent, len + paths->path_len); + rb_str_buf_cat(buf, paths->path, paths->path_len); + return buf; +} + +VALUE paths_to_s(const paths_t *paths) { + return paths_to_s_internal(paths, 0); +} diff --git a/ruby/command-t/ext/command-t/scanner.h b/ruby/command-t/ext/command-t/scanner.h index 128fc7fe..01c22a6c 100644 --- a/ruby/command-t/ext/command-t/scanner.h +++ b/ruby/command-t/ext/command-t/scanner.h @@ -1,11 +1,76 @@ // Copyright 2010-present Greg Hurrell. All rights reserved. // Licensed under the terms of the BSD 2-clause license. -#include "match.h" +#ifndef SCANNER_H +#define SCANNER_H + +#include + +#include + +// The maximum length of any given path. +#define PATHS_MAX_LEN 4096 + +/** A set of paths. + * + * Internally they are stored as a tree with common prefixes shared. This + * class also has some additional metadata to aid searching. + */ +typedef struct paths_t { + struct paths_t *parent; /// The parent path or NULL if this is the root. + size_t length; /// The number of contained paths. + + struct paths_t **subpaths; /// Child paths, sorted in ascending order. + size_t subpaths_len; /// The number of children. + + char *path; /// The string representing this path component. + uint32_t contained_mask; /// A bitmap of chars contained by this path. + uint16_t path_len; /// The size of path in bytes. + uint8_t leaf: 1; /// If set this path is in the set. + + /** If this object owns the string pointed to by path. + * + * Note that even if a string is owned it will be referenced by subpaths. + */ + uint8_t owned_path: 1; +} paths_t; + +static_assert(PATHS_MAX_LEN < UINT16_MAX, "paths_t.path_len is too small."); extern VALUE CommandTPaths_from_array(VALUE, VALUE); extern VALUE CommandTPaths_from_fd(VALUE, VALUE, VALUE, VALUE); extern VALUE CommandTPaths_to_a(VALUE); -extern matches_t *paths_get_matches(VALUE); -extern VALUE matches_to_a(matches_t *); +static inline uint32_t hash_char(char c) { + if ('A' <= c && c <= 'Z') + return 1 << (c - 'A'); + if ('a' <= c && c <= 'z') + return 1 << (c - 'a'); + return 0; +} + +static inline uint32_t contained_mask(const char *s, size_t len) { + uint32_t r = 0; + while (len--) { + char c = *s++; + r |= hash_char(c); + } + return r; +} + +extern paths_t *CommandTPaths_get_paths(VALUE); + +/** Print the given path. + * + * This prints the given path and all ancestors. Note that it does *not* print + * any descendants. + */ +extern VALUE paths_to_s(const paths_t *); + +/** Print a path to stderr. + * + * This prints a debugging representation of the passed path to stderr. + */ +extern void paths_dump(const paths_t *); + +#endif diff --git a/spec/command-t/matcher_spec.rb b/spec/command-t/matcher_spec.rb index 9c91e073..66cc954c 100644 --- a/spec/command-t/matcher_spec.rb +++ b/spec/command-t/matcher_spec.rb @@ -273,7 +273,6 @@ def ordered_matches(paths, query) end it "doesn't show a dotfile just because there was a match at index 0" do - pending 'fix' matcher = matcher(*%w[src/.flowconfig]) expect(matcher.sorted_matches_for('s')).to eq([]) end @@ -298,5 +297,16 @@ def ordered_matches(paths, query) app/assets/components/PrivacyPage/index.jsx ]) end + + it 'correctly matches when added in reverse order' do + matcher = matcher(*%w[ + ruby/command-t/ext/command-t/scanner.c + ruby/command-t/ext/command-t/depend + ruby/command-t/command-t.gemspec + ]) + expect(matcher.sorted_matches_for('scanner')).to match_array %w[ + ruby/command-t/ext/command-t/scanner.c + ] + end end end