From d7c39b61c787122d127a17b5ab2992180e670080 Mon Sep 17 00:00:00 2001
From: Brian McGee
Date: Thu, 17 Oct 2024 11:54:09 +0100
Subject: [PATCH] feat: improve change detection

Previously, we were storing the last `modtime` and `size` for a given path in `boltdb`.
This was used to determine if a file had changed and whether we should format it.

In addition, if a formatter's executable changed (`modtime` or `size`), we would delete
all path entries in the database before processing, thereby forcing each file to be
formatted.

This commit introduces a new approach to change detection, one which takes into account
changes to the underlying file, the formatter's executable, _and_ the formatter's
configuration.

Now, when deciding if we should format a file, we do the following:

- Hash each matching formatter, in sequence, using its `name`, `options` and `priority`,
  as well as the `modtime` and `size` of its executable. This is pre-computed on a
  per-pipeline basis.
- We then add the file's `modtime` and `size` to generate a format signature. You can
  think of this signature as a unique representation of what we are about to do with
  the file.
- The format signature is then compared with a cache entry (if available).
- If the signatures match, we have already applied this sequence of formatters, with
  these particular options and so on, to this file when it had this `modtime` and
  `size`, so there is no more processing to be done.
- If the signatures do not match, we should format the file, recording the new format
  signature in the cache when we are finished.

This approach is simpler in terms of storage, and has the added benefit of finer-grained
change detection versus the brute-force cache busting we were doing before.

With hashes pre-computed per pipeline and the simpler storage schema, there appears to be
no significant performance impact. Manual testing with
[nixpkgs](https://github.com/nixos/nixpkgs) shows comparable run times for both hot and
cold caches.

> [!NOTE]
> Since this changes the database schema, rather than implementing some form of migration
> logic to remove the old buckets and so on, I decided to upgrade the hash algorithm we
> use when determining the filename for the db file.
>
> Previously, we were using `sha1`, matching the behaviour from `1.0`.
> Now we use `sha256`, which results in a slightly longer db name, but has the benefit of
> ensuring a new db instance will be created on first invocation, as well as making
> [gosec](https://golangci-lint.run/usage/linters/#gosec) happy.
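For illustration, the signature scheme described above boils down to something like the
following sketch. This is a simplified approximation, not the exact code added by this
commit: the helper names and the use of `sha256` throughout are assumptions made for
clarity.

```go
package sketch

import (
	"bytes"
	"crypto/sha256"
	"fmt"
	"io/fs"
	"os"
)

type formatter struct {
	name       string
	options    []string
	priority   int
	executable string
}

// formattersSignature hashes a sequence of formatters: each formatter's name,
// options and priority, plus the size and mod time of its executable.
// In practice this is computed once per formatter sequence and then reused.
func formattersSignature(formatters []formatter) ([]byte, error) {
	h := sha256.New()

	for _, f := range formatters {
		// fold in the formatter's config
		fmt.Fprintf(h, "%s %v %d", f.name, f.options, f.priority)

		// and the size / mod time of its executable
		info, err := os.Lstat(f.executable)
		if err != nil {
			return nil, fmt.Errorf("failed to stat %s: %w", f.executable, err)
		}

		fmt.Fprintf(h, " %d %d", info.Size(), info.ModTime().Unix())
	}

	return h.Sum(nil), nil
}

// formatSignature folds the file's own mod time and size into the formatters'
// signature, producing the value that is compared with the cache entry.
func formatSignature(formattersSig []byte, info fs.FileInfo) []byte {
	h := sha256.New()
	h.Write(formattersSig)
	fmt.Fprintf(h, "%d %d", info.ModTime().Unix(), info.Size())

	return h.Sum(nil)
}

// needsFormatting reports whether a file should be formatted, given the
// signature cached from the previous run (nil if the file has never been seen).
func needsFormatting(cached []byte, formattersSig []byte, info fs.FileInfo) bool {
	return !bytes.Equal(cached, formatSignature(formattersSig, info))
}
```

On a match the file is skipped; on a mismatch it is formatted and the new signature is
written back to the cache.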
Closes #455 Signed-off-by: Brian McGee --- cmd/format/format.go | 10 - cmd/root_test.go | 53 +++- format/composite.go | 193 +++++++++++++ format/format.go | 407 ---------------------------- format/formatter.go | 29 +- format/formatter_test.go | 146 +++++++++- format/scheduler.go | 260 ++++++++++++++++++ format/task.go | 45 --- go.mod | 5 +- go.sum | 4 - nix/packages/treefmt/gomod2nix.toml | 9 +- test/{temp.go => test.go} | 19 +- walk/cache/bucket.go | 100 ------- walk/cache/cache.go | 53 ++-- walk/cached.go | 40 +-- walk/cached_test.go | 137 ---------- walk/filesystem.go | 2 +- walk/git.go | 2 +- walk/walk.go | 56 +++- 19 files changed, 774 insertions(+), 796 deletions(-) create mode 100644 format/composite.go delete mode 100644 format/format.go create mode 100644 format/scheduler.go delete mode 100644 format/task.go rename test/{temp.go => test.go} (71%) delete mode 100644 walk/cache/bucket.go delete mode 100644 walk/cached_test.go diff --git a/cmd/format/format.go b/cmd/format/format.go index 40dcc094..829c91a6 100644 --- a/cmd/format/format.go +++ b/cmd/format/format.go @@ -77,9 +77,6 @@ func Run(v *viper.Viper, statz *stats.Stats, cmd *cobra.Command, paths []string) }() } - // set a prefix on the default logger - log.SetPrefix("format") - var db *bolt.DB // open the db unless --no-cache was specified @@ -161,13 +158,6 @@ func Run(v *viper.Viper, statz *stats.Stats, cmd *cobra.Command, paths []string) return fmt.Errorf("failed to create composite formatter: %w", err) } - if db != nil { - // compare formatters with the db, busting the cache if the formatters have changed - if err := formatter.BustCache(db); err != nil { - return fmt.Errorf("failed to compare formatters: %w", err) - } - } - // create a new walker for traversing the paths walker, err := walk.NewCompositeReader(walkType, cfg.TreeRoot, paths, db, statz) if err != nil { diff --git a/cmd/root_test.go b/cmd/root_test.go index 1425dd32..1ad40468 100644 --- a/cmd/root_test.go +++ b/cmd/root_test.go @@ -56,7 +56,7 @@ func TestOnUnmatched(t *testing.T) { checkOutput := func(level string, output []byte) { for _, p := range paths { - as.Contains(string(output), fmt.Sprintf("%s format: no formatter for path: %s", level, p)) + as.Contains(string(output), fmt.Sprintf("%s no formatter for path: %s", level, p)) } } @@ -605,7 +605,7 @@ func TestBustCacheOnFormatterChange(t *testing.T) { configPath := tempDir + "/touch.toml" // symlink some formatters into temp dir, so we can mess with their mod times - binPath := tempDir + "/bin" + binPath := filepath.Join(tempDir, "bin") as.NoError(os.Mkdir(binPath, 0o755)) binaries := []string{"black", "elm-format", "gofmt"} @@ -613,7 +613,7 @@ func TestBustCacheOnFormatterChange(t *testing.T) { for _, name := range binaries { src, err := exec.LookPath(name) as.NoError(err) - as.NoError(os.Symlink(src, binPath+"/"+name)) + as.NoError(os.Symlink(src, filepath.Join(binPath, name))) } // prepend our test bin directory to PATH @@ -647,7 +647,8 @@ func TestBustCacheOnFormatterChange(t *testing.T) { }) // tweak mod time of elm formatter - as.NoError(test.RecreateSymlink(t, binPath+"/"+"elm-format")) + newTime := time.Now().Add(-time.Minute) + as.NoError(test.Lutimes(t, filepath.Join(binPath, "elm-format"), newTime, newTime)) _, statz, err = treefmt(t, args...) 
as.NoError(err) @@ -655,7 +656,7 @@ func TestBustCacheOnFormatterChange(t *testing.T) { assertStats(t, as, statz, map[stats.Type]int{ stats.Traversed: 32, stats.Matched: 3, - stats.Formatted: 3, + stats.Formatted: 1, stats.Changed: 0, }) @@ -671,7 +672,7 @@ func TestBustCacheOnFormatterChange(t *testing.T) { }) // tweak mod time of python formatter - as.NoError(test.RecreateSymlink(t, binPath+"/"+"black")) + as.NoError(test.Lutimes(t, filepath.Join(binPath, "black"), newTime, newTime)) _, statz, err = treefmt(t, args...) as.NoError(err) @@ -679,7 +680,7 @@ func TestBustCacheOnFormatterChange(t *testing.T) { assertStats(t, as, statz, map[stats.Type]int{ stats.Traversed: 32, stats.Matched: 3, - stats.Formatted: 3, + stats.Formatted: 2, stats.Changed: 0, }) @@ -695,11 +696,12 @@ func TestBustCacheOnFormatterChange(t *testing.T) { }) // add go formatter - cfg.FormatterConfigs["go"] = &config.Formatter{ + goFormatter := &config.Formatter{ Command: "gofmt", Options: []string{"-w"}, Includes: []string{"*.go"}, } + cfg.FormatterConfigs["go"] = goFormatter test.WriteConfig(t, configPath, cfg) _, statz, err = treefmt(t, args...) @@ -708,7 +710,7 @@ func TestBustCacheOnFormatterChange(t *testing.T) { assertStats(t, as, statz, map[stats.Type]int{ stats.Traversed: 32, stats.Matched: 4, - stats.Formatted: 4, + stats.Formatted: 1, stats.Changed: 0, }) @@ -723,6 +725,35 @@ func TestBustCacheOnFormatterChange(t *testing.T) { stats.Changed: 0, }) + // tweak go formatter options + goFormatter.Options = []string{"-w", "-s"} + + test.WriteConfig(t, configPath, cfg) + + _, statz, err = treefmt(t, args...) + as.NoError(err) + + assertStats(t, as, statz, map[stats.Type]int{ + stats.Traversed: 32, + stats.Matched: 4, + stats.Formatted: 1, + stats.Changed: 0, + }) + + // add a priority + cfg.FormatterConfigs["go"].Priority = 3 + test.WriteConfig(t, configPath, cfg) + + _, statz, err = treefmt(t, args...) + as.NoError(err) + + assertStats(t, as, statz, map[stats.Type]int{ + stats.Traversed: 32, + stats.Matched: 4, + stats.Formatted: 1, + stats.Changed: 0, + }) + // remove python formatter delete(cfg.FormatterConfigs, "python") test.WriteConfig(t, configPath, cfg) @@ -733,7 +764,7 @@ func TestBustCacheOnFormatterChange(t *testing.T) { assertStats(t, as, statz, map[stats.Type]int{ stats.Traversed: 32, stats.Matched: 2, - stats.Formatted: 2, + stats.Formatted: 0, stats.Changed: 0, }) @@ -758,7 +789,7 @@ func TestBustCacheOnFormatterChange(t *testing.T) { assertStats(t, as, statz, map[stats.Type]int{ stats.Traversed: 32, stats.Matched: 1, - stats.Formatted: 1, + stats.Formatted: 0, stats.Changed: 0, }) diff --git a/format/composite.go b/format/composite.go new file mode 100644 index 00000000..8b68a393 --- /dev/null +++ b/format/composite.go @@ -0,0 +1,193 @@ +package format + +import ( + "context" + "crypto/sha256" + "errors" + "fmt" + "os" + "slices" + + "github.com/charmbracelet/log" + "github.com/gobwas/glob" + "github.com/numtide/treefmt/config" + "github.com/numtide/treefmt/stats" + "github.com/numtide/treefmt/walk" + "mvdan.cc/sh/v3/expand" +) + +const ( + batchKeySeparator = ":" +) + +var ErrFormattingFailures = errors.New("formatting failures detected") + +// CompositeFormatter handles the application of multiple Formatter instances based on global excludes and individual +// formatter configuration. 
+type CompositeFormatter struct { + cfg *config.Config + stats *stats.Stats + globalExcludes []glob.Glob + + unmatchedLevel log.Level + + scheduler *scheduler + formatters map[string]*Formatter +} + +// match filters the file against global excludes and returns a list of formatters that want to process the file. +func (c *CompositeFormatter) match(file *walk.File) []*Formatter { + // first check if this file has been globally excluded + if pathMatches(file.RelPath, c.globalExcludes) { + log.Debugf("path matched global excludes: %s", file.RelPath) + + return nil + } + + // a list of formatters that match this file + var matches []*Formatter + + // iterate the formatters, recording which are interested in this file + for _, formatter := range c.formatters { + if formatter.Wants(file) { + matches = append(matches, formatter) + } + } + + return matches +} + +// Apply applies the configured formatters to the given files. +func (c *CompositeFormatter) Apply(ctx context.Context, files []*walk.File) error { + var toRelease []*walk.File + + for _, file := range files { + matches := c.match(file) // match the file against the formatters + + // check if there were no matches + if len(matches) == 0 { + // log that there was no match, exiting with an error if the unmatched level was set to fatal + if c.unmatchedLevel == log.FatalLevel { + return fmt.Errorf("no formatter for path: %s", file.RelPath) + } + + log.Logf(c.unmatchedLevel, "no formatter for path: %s", file.RelPath) + + // no further processing to be done, append to the release list + toRelease = append(toRelease, file) + + // continue to the next file + continue + } + + // record there was a match + c.stats.Add(stats.Matched, 1) + + if accepted, err := c.scheduler.submit(ctx, file, matches); err != nil { + return fmt.Errorf("failed to schedule file: %w", err) + } else if !accepted { + // if a file wasn't accepted, it means there was no formatting to perform + toRelease = append(toRelease, file) + } + } + + // release files that require no further processing + // we set noCache to true as there's no need to update the cache, since we skipped those files + releaseCtx := walk.SetNoCache(ctx, true) + + for _, file := range toRelease { + if err := file.Release(releaseCtx); err != nil { + return fmt.Errorf("failed to release file: %w", err) + } + } + + return nil +} + +// signature takes anything that might affect the paths to be traversed, +// or how they are traversed, and adds it to a sha256 hash. +// This can be used to determine if there has been a material change in config. +func (c *CompositeFormatter) signature() (signature, error) { + h := sha256.New() + + // sort formatters deterministically + formatters := make([]*Formatter, 0, len(c.formatters)) + for _, f := range c.formatters { + formatters = append(formatters, f) + } + + slices.SortFunc(formatters, formatterSortFunc) + + // apply them to the hash + for _, f := range formatters { + if err := f.Hash(h); err != nil { + return nil, fmt.Errorf("failed to hash formatter: %w", err) + } + } + + // finalize + return h.Sum(nil), nil +} + +// Close finalizes the processing of the CompositeFormatter, ensuring that any remaining batches are applied and +// all formatters have completed their tasks. It returns an error if any formatting failures were detected. 
+func (c *CompositeFormatter) Close(ctx context.Context) error { + return c.scheduler.close(ctx) +} + +func NewCompositeFormatter( + cfg *config.Config, + statz *stats.Stats, + batchSize int, +) (*CompositeFormatter, error) { + // compile global exclude globs + globalExcludes, err := compileGlobs(cfg.Excludes) + if err != nil { + return nil, fmt.Errorf("failed to compile global excludes: %w", err) + } + + // parse unmatched log level + unmatchedLevel, err := log.ParseLevel(cfg.OnUnmatched) + if err != nil { + return nil, fmt.Errorf("invalid on-unmatched value: %w", err) + } + + // create a composite formatter, adjusting the change logging based on --fail-on-change + changeLevel := log.DebugLevel + if cfg.FailOnChange { + changeLevel = log.ErrorLevel + } + + // create formatters + formatters := make(map[string]*Formatter) + + env := expand.ListEnviron(os.Environ()...) + + for name, formatterCfg := range cfg.FormatterConfigs { + formatter, err := newFormatter(name, cfg.TreeRoot, env, formatterCfg) + + if errors.Is(err, ErrCommandNotFound) && cfg.AllowMissingFormatter { + log.Debugf("formatter command not found: %v", name) + + continue + } else if err != nil { + return nil, fmt.Errorf("failed to initialise formatter %v: %w", name, err) + } + + // store formatter by name + formatters[name] = formatter + } + + // create a scheduler for carrying out the actual formatting + scheduler := newScheduler(statz, batchSize, changeLevel, formatters) + + return &CompositeFormatter{ + cfg: cfg, + stats: statz, + globalExcludes: globalExcludes, + unmatchedLevel: unmatchedLevel, + + scheduler: scheduler, + formatters: formatters, + }, nil +} diff --git a/format/format.go b/format/format.go deleted file mode 100644 index 86528122..00000000 --- a/format/format.go +++ /dev/null @@ -1,407 +0,0 @@ -package format - -import ( - "cmp" - "context" - "errors" - "fmt" - "os" - "runtime" - "slices" - "strings" - "sync/atomic" - "time" - - "github.com/charmbracelet/log" - "github.com/gobwas/glob" - "github.com/numtide/treefmt/config" - "github.com/numtide/treefmt/stats" - "github.com/numtide/treefmt/walk" - "github.com/numtide/treefmt/walk/cache" - bolt "go.etcd.io/bbolt" - "golang.org/x/sync/errgroup" - "mvdan.cc/sh/v3/expand" -) - -const ( - batchKeySeparator = ":" -) - -var ErrFormattingFailures = errors.New("formatting failures detected") - -// batchKey represents the unique sequence of formatters to be applied to a batch of files. -// For example, "deadnix:statix:nixpkgs-fmt" indicates that deadnix should be applied first, statix second and -// nixpkgs-fmt third. -// Files are batched based on their formatting sequence, as determined by the priority and includes/excludes in the -// formatter configuration. -type batchKey string - -// sequence returns the list of formatters, by name, to be applied to a batch of files. -func (b batchKey) sequence() []string { - return strings.Split(string(b), batchKeySeparator) -} - -func newBatchKey(formatters []*Formatter) batchKey { - components := make([]string, 0, len(formatters)) - for _, f := range formatters { - components = append(components, f.Name()) - } - - return batchKey(strings.Join(components, batchKeySeparator)) -} - -// batchMap maintains a mapping between batchKey and a slice of pointers to walk.File, used to organize files into -// batches based on the sequence of formatters to be applied. 
-type batchMap map[batchKey][]*walk.File - -func formatterSortFunc(a, b *Formatter) int { - // sort by priority in ascending order - priorityA := a.Priority() - priorityB := b.Priority() - - result := priorityA - priorityB - if result == 0 { - // formatters with the same priority are sorted lexicographically to ensure a deterministic outcome - result = cmp.Compare(a.Name(), b.Name()) - } - - return result -} - -// Append adds a file to the batch corresponding to the given sequence of formatters and returns the updated batch. -func (b batchMap) Append(file *walk.File, matches []*Formatter) (key batchKey, batch []*walk.File) { - slices.SortFunc(matches, formatterSortFunc) - - // construct a batch key based on the sequence of formatters - key = newBatchKey(matches) - - // append to the batch - b[key] = append(b[key], file) - - // return the batch - return key, b[key] -} - -// CompositeFormatter handles the application of multiple Formatter instances based on global excludes and individual -// formatter configuration. -type CompositeFormatter struct { - stats *stats.Stats - batchSize int - globalExcludes []glob.Glob - - changeLevel log.Level - unmatchedLevel log.Level - - formatters map[string]*Formatter - - eg *errgroup.Group - batches batchMap - - // formatError indicates if at least one formatting error occurred - formatError *atomic.Bool -} - -func (c *CompositeFormatter) apply(ctx context.Context, key batchKey, batch []*walk.File) { - c.eg.Go(func() error { - var formatErrors []error - - // apply the formatters in sequence - for _, name := range key.sequence() { - formatter := c.formatters[name] - - if err := formatter.Apply(ctx, batch); err != nil { - formatErrors = append(formatErrors, err) - } - } - - // record if a format error occurred - hasErrors := len(formatErrors) > 0 - c.formatError.Store(hasErrors) - - if !hasErrors { - // record that the file was formatted - c.stats.Add(stats.Formatted, len(batch)) - } - - // Create a release context. - // We set no-cache based on whether any formatting errors occurred in this batch. - // This is to communicate with any caching layer, if used when reading files for this batch, that it should not - // update the state of any file in this batch, as we want to re-process them in later invocations. - releaseCtx := walk.SetNoCache(ctx, hasErrors) - - // post-processing - for _, file := range batch { - // check if the file has changed - changed, newInfo, err := file.Stat() - if err != nil { - return err - } - - if changed { - // record that a change in the underlying file occurred - c.stats.Add(stats.Changed, 1) - - log.Log( - c.changeLevel, "file has changed", - "path", file.RelPath, - "prev_size", file.Info.Size(), - "prev_mod_time", file.Info.ModTime().Truncate(time.Second), - "current_size", newInfo.Size(), - "current_mod_time", newInfo.ModTime().Truncate(time.Second), - ) - - // update the file info - file.Info = newInfo - } - - // release the file as there is no further processing to be done on it - if err := file.Release(releaseCtx); err != nil { - return fmt.Errorf("failed to release file: %w", err) - } - } - - return nil - }) -} - -// match filters the file against global excludes and returns a list of formatters that want to process the file. 
-func (c *CompositeFormatter) match(file *walk.File) []*Formatter { - // first check if this file has been globally excluded - if pathMatches(file.RelPath, c.globalExcludes) { - log.Debugf("path matched global excludes: %s", file.RelPath) - - return nil - } - - // a list of formatters that match this file - var matches []*Formatter - - // otherwise, check if any formatters are interested in it - for _, formatter := range c.formatters { - if formatter.Wants(file) { - matches = append(matches, formatter) - } - } - - return matches -} - -// Apply applies the configured formatters to the given files. -func (c *CompositeFormatter) Apply(ctx context.Context, files []*walk.File) error { - var toRelease []*walk.File - - for _, file := range files { - matches := c.match(file) // match the file against the formatters - - // check if there were no matches - if len(matches) == 0 { - // log that there was no match, exiting with an error if the unmatched level was set to fatal - if c.unmatchedLevel == log.FatalLevel { - return fmt.Errorf("no formatter for path: %s", file.RelPath) - } - - log.Logf(c.unmatchedLevel, "no formatter for path: %s", file.RelPath) - - // no further processing to be done, append to the release list - toRelease = append(toRelease, file) - - // continue to the next file - continue - } - - // record there was a match - c.stats.Add(stats.Matched, 1) - - // check if the file is new or has changed when compared to the cache entry - if file.Cache == nil || file.Cache.HasChanged(file.Info) { - // add this file to a batch and if it's full, apply formatters to the batch - if key, batch := c.batches.Append(file, matches); len(batch) == c.batchSize { - c.apply(ctx, newBatchKey(matches), batch) - // reset the batch - c.batches[key] = make([]*walk.File, 0, c.batchSize) - } - } else { - // no further processing to be done, append to the release list - toRelease = append(toRelease, file) - } - } - - // release files that require no further processing - // we set noCache to true as there's no need to update the cache, since we skipped those files - releaseCtx := walk.SetNoCache(ctx, true) - - for _, file := range toRelease { - if err := file.Release(releaseCtx); err != nil { - return fmt.Errorf("failed to release file: %w", err) - } - } - - return nil -} - -// BustCache compares the currently configured formatters with their respective entries in the db. -// If a formatter was added, removed or modified, we clear any path entries from the cache, ensuring that all paths -// get formatted with the most recent formatter set. 
-func (c *CompositeFormatter) BustCache(db *bolt.DB) error { - return db.Update(func(tx *bolt.Tx) error { - clearPaths := false - - pathsBucket, err := cache.BucketPaths(tx) - if err != nil { - return fmt.Errorf("failed to get paths bucket from cache: %w", err) - } - - formattersBucket, err := cache.BucketFormatters(tx) - if err != nil { - return fmt.Errorf("failed to get formatters bucket from cache: %w", err) - } - - // check for any newly configured or modified formatters - for name, formatter := range c.formatters { - stat, err := os.Lstat(formatter.Executable()) - if err != nil { - return fmt.Errorf("failed to stat formatter executable %v: %w", formatter.Executable(), err) - } - - entry, err := formattersBucket.Get(name) - if !(err == nil || errors.Is(err, cache.ErrKeyNotFound)) { - return fmt.Errorf("failed to retrieve cache entry for formatter %v: %w", name, err) - } - - isNew := errors.Is(err, cache.ErrKeyNotFound) - hasChanged := !(isNew || (entry.Size == stat.Size() && entry.Modified == stat.ModTime())) - - if isNew { - log.Debugf("formatter '%s' is new", name) - } else if hasChanged { - log.Debug("formatter '%s' has changed", - name, - "size", stat.Size(), - "modTime", stat.ModTime(), - "cachedSize", entry.Size, - "cachedModTime", entry.Modified, - ) - } - - // update overall flag - clearPaths = clearPaths || isNew || hasChanged - - // record formatters info - entry = &cache.Entry{ - Size: stat.Size(), - Modified: stat.ModTime(), - } - - if err = formattersBucket.Put(name, entry); err != nil { - return fmt.Errorf("failed to write cache entry for formatter %v: %w", name, err) - } - } - - // check for any removed formatters - if err = formattersBucket.ForEach(func(key string, _ *cache.Entry) error { - _, ok := c.formatters[key] - if !ok { - // remove the formatter entry from the cache - if err = formattersBucket.Delete(key); err != nil { - return fmt.Errorf("failed to remove cache entry for formatter %v: %w", key, err) - } - // indicate a clean is required - clearPaths = true - } - - return nil - }); err != nil { - return fmt.Errorf("failed to check cache for removed formatters: %w", err) - } - - if clearPaths { - // remove all path entries - if err := pathsBucket.DeleteAll(); err != nil { - return fmt.Errorf("failed to remove all path entries from cache: %w", err) - } - } - - return nil - }) -} - -// Close finalizes the processing of the CompositeFormatter, ensuring that any remaining batches are applied and -// all formatters have completed their tasks. It returns an error if any formatting failures were detected. 
-func (c *CompositeFormatter) Close(ctx context.Context) error { - // flush any partial batches that remain - for key, batch := range c.batches { - if len(batch) > 0 { - c.apply(ctx, key, batch) - } - } - - // wait for processing to complete - if err := c.eg.Wait(); err != nil { - return fmt.Errorf("failed to wait for formatters: %w", err) - } else if c.formatError.Load() { - return ErrFormattingFailures - } - - return nil -} - -func NewCompositeFormatter( - cfg *config.Config, - statz *stats.Stats, - batchSize int, -) (*CompositeFormatter, error) { - // compile global exclude globs - globalExcludes, err := compileGlobs(cfg.Excludes) - if err != nil { - return nil, fmt.Errorf("failed to compile global excludes: %w", err) - } - - // parse unmatched log level - unmatchedLevel, err := log.ParseLevel(cfg.OnUnmatched) - if err != nil { - return nil, fmt.Errorf("invalid on-unmatched value: %w", err) - } - - // create a composite formatter, adjusting the change logging based on --fail-on-change - changeLevel := log.DebugLevel - if cfg.FailOnChange { - changeLevel = log.ErrorLevel - } - - // create formatters - formatters := make(map[string]*Formatter) - - env := expand.ListEnviron(os.Environ()...) - - for name, formatterCfg := range cfg.FormatterConfigs { - formatter, err := newFormatter(name, cfg.TreeRoot, env, formatterCfg) - - if errors.Is(err, ErrCommandNotFound) && cfg.AllowMissingFormatter { - log.Debugf("formatter command not found: %v", name) - - continue - } else if err != nil { - return nil, fmt.Errorf("failed to initialise formatter %v: %w", name, err) - } - - // store formatter by name - formatters[name] = formatter - } - - // create an errgroup for asynchronously formatting - eg := errgroup.Group{} - // we use a simple heuristic to avoid too much contention by limiting the concurrency to runtime.NumCPU() - eg.SetLimit(runtime.NumCPU()) - - return &CompositeFormatter{ - stats: statz, - batchSize: batchSize, - globalExcludes: globalExcludes, - changeLevel: changeLevel, - unmatchedLevel: unmatchedLevel, - formatters: formatters, - eg: &eg, - batches: make(batchMap), - formatError: new(atomic.Bool), - }, nil -} diff --git a/format/formatter.go b/format/formatter.go index 5254565d..fcf1aaa3 100644 --- a/format/formatter.go +++ b/format/formatter.go @@ -4,9 +4,11 @@ import ( "context" "errors" "fmt" + "hash" "os" "os/exec" "regexp" + "strings" "time" "github.com/charmbracelet/log" @@ -52,6 +54,29 @@ func (f *Formatter) Executable() string { return f.executable } +// Hash adds this formatter's config and executable info to the config hash being created. +func (f *Formatter) Hash(h hash.Hash) error { + // including the name helps us to easily detect when formatter's have been added/removed + h.Write([]byte(f.name)) + // if options change, the outcome of applying the formatter might be different + h.Write([]byte(strings.Join(f.config.Options, " "))) + // if priority changes, the outcome of applying a sequence of formatters might be different + h.Write([]byte(fmt.Sprintf("%d", f.config.Priority))) + + // stat the formatter's executable + info, err := os.Lstat(f.executable) + if err != nil { + return fmt.Errorf("failed to stat formatter executable: %w", err) + } + + // include the executable's size and mod time + // if the formatter executable changes (e.g. 
new version) the outcome of applying the formatter might differ + h.Write([]byte(fmt.Sprintf("%d", info.Size()))) + h.Write([]byte(fmt.Sprintf("%d", info.ModTime().Unix()))) + + return nil +} + func (f *Formatter) Apply(ctx context.Context, files []*walk.File) error { start := time.Now() @@ -137,9 +162,9 @@ func newFormatter( // initialise internal state if cfg.Priority > 0 { - f.log = log.WithPrefix(fmt.Sprintf("format | %s[%d]", name, cfg.Priority)) + f.log = log.WithPrefix(fmt.Sprintf("formatter | %s[%d]", name, cfg.Priority)) } else { - f.log = log.WithPrefix(fmt.Sprintf("format | %s", name)) + f.log = log.WithPrefix(fmt.Sprintf("formatter | %s", name)) } f.includes, err = compileGlobs(cfg.Includes) diff --git a/format/formatter_test.go b/format/formatter_test.go index 795ed4f2..cff5f3b5 100644 --- a/format/formatter_test.go +++ b/format/formatter_test.go @@ -1,11 +1,15 @@ -package format_test +package format //nolint:testpackage import ( + "os" + "os/exec" + "path/filepath" "testing" + "time" "github.com/numtide/treefmt/config" - "github.com/numtide/treefmt/format" "github.com/numtide/treefmt/stats" + "github.com/numtide/treefmt/test" "github.com/stretchr/testify/require" ) @@ -20,7 +24,7 @@ func TestInvalidFormatterName(t *testing.T) { statz := stats.New() // simple "empty" config - _, err := format.NewCompositeFormatter(cfg, &statz, batchSize) + _, err := NewCompositeFormatter(cfg, &statz, batchSize) as.NoError(err) // valid name using all the acceptable characters @@ -30,7 +34,7 @@ func TestInvalidFormatterName(t *testing.T) { }, } - _, err = format.NewCompositeFormatter(cfg, &statz, batchSize) + _, err = NewCompositeFormatter(cfg, &statz, batchSize) as.NoError(err) // test with some bad examples @@ -43,7 +47,137 @@ func TestInvalidFormatterName(t *testing.T) { }, } - _, err = format.NewCompositeFormatter(cfg, &statz, batchSize) - as.ErrorIs(err, format.ErrInvalidName) + _, err = NewCompositeFormatter(cfg, &statz, batchSize) + as.ErrorIs(err, ErrInvalidName) } } + +func TestFormatSignature(t *testing.T) { + as := require.New(t) + + const batchSize = 1024 + + statz := stats.New() + + tempDir := t.TempDir() + + // symlink some formatters into temp dir, so we can mess with their mod times + binPath := filepath.Join(tempDir, "bin") + as.NoError(os.Mkdir(binPath, 0o755)) + + binaries := []string{"black", "elm-format", "gofmt"} + + for _, name := range binaries { + src, err := exec.LookPath(name) + as.NoError(err) + as.NoError(os.Symlink(src, filepath.Join(binPath, name))) + } + + // prepend our test bin directory to PATH + t.Setenv("PATH", binPath+":"+os.Getenv("PATH")) + + // start with 2 formatters + cfg := &config.Config{ + OnUnmatched: "info", + FormatterConfigs: map[string]*config.Formatter{ + "python": { + Command: "black", + Includes: []string{"*.py"}, + }, + "elm": { + Command: "elm-format", + Options: []string{"--yes"}, + Includes: []string{"*.elm"}, + }, + }, + } + + oldSignature := assertSignatureChangedAndStable(t, as, cfg, nil) + + t.Run("change formatter mod time", func(t *testing.T) { + for _, name := range []string{"black", "elm-format"} { + t.Logf("changing mod time of %s", name) + + // tweak mod time + newTime := time.Now().Add(-time.Minute) + as.NoError(test.Lutimes(t, filepath.Join(binPath, name), newTime, newTime)) + + oldSignature = assertSignatureChangedAndStable(t, as, cfg, oldSignature) + } + }) + + t.Run("modify formatter options", func(_ *testing.T) { + f, err := NewCompositeFormatter(cfg, &statz, batchSize) + as.NoError(err) + + oldSignature = 
assertSignatureChangedAndStable(t, as, cfg, nil) + + // adjust python includes + python := cfg.FormatterConfigs["python"] + python.Includes = []string{"*.py", "*.pyi"} + + newHash, err := f.signature() + as.NoError(err) + as.Equal(oldSignature, newHash, "hash should not have changed") + + // adjust python excludes + python.Excludes = []string{"*.pyi"} + + newHash, err = f.signature() + as.NoError(err) + as.Equal(oldSignature, newHash, "hash should not have changed") + + // adjust python options + python.Options = []string{"-w", "-s"} + oldSignature = assertSignatureChangedAndStable(t, as, cfg, oldSignature) + + // adjust python priority + python.Priority = 100 + oldSignature = assertSignatureChangedAndStable(t, as, cfg, oldSignature) + + // adjust command + python.Command = "deadnix" + oldSignature = assertSignatureChangedAndStable(t, as, cfg, oldSignature) + }) + + t.Run("add/remove formatters", func(_ *testing.T) { + cfg.FormatterConfigs["go"] = &config.Formatter{ + Command: "gofmt", + Options: []string{"-w"}, + Includes: []string{"*.go"}, + } + + oldSignature = assertSignatureChangedAndStable(t, as, cfg, oldSignature) + + // remove python formatter + delete(cfg.FormatterConfigs, "python") + oldSignature = assertSignatureChangedAndStable(t, as, cfg, oldSignature) + + // remove elm formatter + delete(cfg.FormatterConfigs, "elm") + oldSignature = assertSignatureChangedAndStable(t, as, cfg, oldSignature) + }) +} + +func assertSignatureChangedAndStable( + t *testing.T, + as *require.Assertions, + cfg *config.Config, + oldSignature signature, +) (h signature) { + t.Helper() + + statz := stats.New() + f, err := NewCompositeFormatter(cfg, &statz, 1024) + as.NoError(err) + + newHash, err := f.signature() + as.NoError(err) + as.NotEqual(oldSignature, newHash, "hash should have changed") + + sameHash, err := f.signature() + as.NoError(err) + as.Equal(newHash, sameHash, "hash should not have changed") + + return newHash +} diff --git a/format/scheduler.go b/format/scheduler.go new file mode 100644 index 00000000..86922427 --- /dev/null +++ b/format/scheduler.go @@ -0,0 +1,260 @@ +package format + +import ( + "bytes" + "cmp" + "context" + "crypto/md5" //nolint:gosec + "fmt" + "runtime" + "slices" + "strings" + "sync/atomic" + "time" + + "github.com/charmbracelet/log" + "github.com/numtide/treefmt/stats" + "github.com/numtide/treefmt/walk" + "golang.org/x/sync/errgroup" +) + +type ( + // batch represents a collection of File pointers to be processed together. + batch []*walk.File + + // batchKey represents the unique sequence of formatters to be applied to a batch of files. + // For example, "deadnix:statix:nixpkgs-fmt" indicates that deadnix should be applied first, statix second and + // nixpkgs-fmt third. + // Files are batched based on their formatting sequence, as determined by the priority and includes/excludes in the + // formatter configuration. + batchKey string + + // signature is a sha256 hash of a sequence of formatters. + signature []byte +) + +// sequence returns the list of formatters, by name, to be applied to a batch of files. +func (b batchKey) sequence() []string { + return strings.Split(string(b), batchKeySeparator) +} + +// newBatchKey takes a list of Formatters and returns a batchKey string composed of their names joined by ":". 
+func newBatchKey(formatters []*Formatter) batchKey { + components := make([]string, 0, len(formatters)) + for _, f := range formatters { + components = append(components, f.Name()) + } + + return batchKey(strings.Join(components, batchKeySeparator)) +} + +type scheduler struct { + batchSize int + changeLevel log.Level + formatters map[string]*Formatter + + eg *errgroup.Group + stats *stats.Stats + + batches map[batchKey]batch + signatures map[batchKey]signature + + // formatError indicates if at least one formatting error occurred + formatError *atomic.Bool +} + +func (s *scheduler) formattersSignature(key batchKey, formatters []*Formatter) ([]byte, error) { + sig, ok := s.signatures[key] + if ok { + // return pre-computed signature + return sig, nil + } + + // generate a signature by hashing each formatter in order + h := md5.New() //nolint:gosec + for _, f := range formatters { + if err := f.Hash(h); err != nil { + return nil, fmt.Errorf("failed to hash formatter %s: %w", f.Name(), err) + } + } + + sig = h.Sum(nil) + + // store we don't have to re-compute for each file + s.signatures[key] = sig + + return sig, nil +} + +func (s *scheduler) submit( + ctx context.Context, + file *walk.File, + matches []*Formatter, +) (accepted bool, err error) { + slices.SortFunc(matches, formatterSortFunc) + + // construct a batch key based on the sequence of formatters + key := newBatchKey(matches) + + // get format signature + formattersSig, err := s.formattersSignature(key, matches) + if err != nil { + return false, fmt.Errorf("failed to get formatter's signature: %w", err) + } + + // calculate the overall signature + signature, err := file.FormatSignature(formattersSig) + if err != nil { + return false, fmt.Errorf("failed to calculate file signature: %w", err) + } + + // compare signature with last cache entry + if bytes.Equal(signature, file.CachedFormatSignature) { + // If the signature is the same as the last cache entry, there is nothing to do. + // We know from the hash signature that we have already applied this sequence of formatters (and their config) to + // this file. + // When we applied the formatters, the file had the same mod time and file size. + return false, nil + } + + // append the formatters sig to the file + // it will be necessary later to calculate a new format signature + file.FormattersSignature = formattersSig + + // append to the batch + s.batches[key] = append(s.batches[key], file) + + // schedule the batch for processing if it's full + if len(s.batches[key]) == s.batchSize { + s.schedule(ctx, key, s.batches[key]) + // reset the batch + s.batches[key] = make([]*walk.File, 0, s.batchSize) + } + + return true, nil +} + +// schedule begins processing a batch in the background. +func (s *scheduler) schedule(ctx context.Context, key batchKey, batch []*walk.File) { + s.eg.Go(func() error { + var formatErrors []error + + // apply the formatters in sequence + for _, name := range key.sequence() { + formatter := s.formatters[name] + + if err := formatter.Apply(ctx, batch); err != nil { + formatErrors = append(formatErrors, err) + } + } + + // record if a format error occurred + hasErrors := len(formatErrors) > 0 + + // update overall error tracking + s.formatError.Store(hasErrors) + + if !hasErrors { + // record that the file was formatted + s.stats.Add(stats.Formatted, len(batch)) + } + + // Create a release context. + // We set no-cache based on whether any formatting errors occurred in this batch. 
+ // This is to communicate with any caching layer, if used when reading files for this batch, that it should not + // update the state of any file in this batch, as we want to re-process them in later invocations. + releaseCtx := walk.SetNoCache(ctx, hasErrors) + + // post-processing + for _, file := range batch { + // check if the file has changed + changed, newInfo, err := file.Stat() + if err != nil { + return fmt.Errorf("failed to stat file: %w", err) + } + + if changed { + // record the change + s.stats.Add(stats.Changed, 1) + + // log the change (useful for diagnosing issues) + log.Log( + s.changeLevel, "file has changed", + "path", file.RelPath, + "prev_size", file.Info.Size(), + "prev_mod_time", file.Info.ModTime().Truncate(time.Second), + "current_size", newInfo.Size(), + "current_mod_time", newInfo.ModTime().Truncate(time.Second), + ) + + // record the new file info + file.FormattedInfo = newInfo + } + + // release the file as there is no further processing to be done on it + if err := file.Release(releaseCtx); err != nil { + return fmt.Errorf("failed to release file: %w", err) + } + } + + return nil + }) +} + +func (s *scheduler) close(ctx context.Context) error { + // schedule any partial batches that remain + for key, batch := range s.batches { + if len(batch) > 0 { + s.schedule(ctx, key, batch) + } + } + + // wait for processing to complete + if err := s.eg.Wait(); err != nil { + return fmt.Errorf("failed to wait for formatters: %w", err) + } else if s.formatError.Load() { + return ErrFormattingFailures + } + + return nil +} + +// formatterSortFunc sorts formatters by their priority in ascending order; ties are resolved by lexicographic order of +// names. +func formatterSortFunc(a, b *Formatter) int { + // sort by priority in ascending order + priorityA := a.Priority() + priorityB := b.Priority() + + result := priorityA - priorityB + if result == 0 { + // formatters with the same priority are sorted lexicographically to ensure a deterministic outcome + result = cmp.Compare(a.Name(), b.Name()) + } + + return result +} + +func newScheduler( + statz *stats.Stats, + batchSize int, + changeLevel log.Level, + formatters map[string]*Formatter, +) *scheduler { + eg := &errgroup.Group{} + // we use a simple heuristic to avoid too much contention by limiting the concurrency to runtime.NumCPU() + eg.SetLimit(runtime.NumCPU()) + + return &scheduler{ + batchSize: batchSize, + changeLevel: changeLevel, + formatters: formatters, + + eg: eg, + stats: statz, + + batches: make(map[batchKey]batch), + signatures: make(map[batchKey]signature), + formatError: &atomic.Bool{}, + } +} diff --git a/format/task.go b/format/task.go deleted file mode 100644 index 922dc6db..00000000 --- a/format/task.go +++ /dev/null @@ -1,45 +0,0 @@ -package format - -import ( - "cmp" - "slices" - - "github.com/numtide/treefmt/walk" -) - -type Task struct { - File *walk.File - Formatters []*Formatter - BatchKey string - Errors []error -} - -func NewTask(file *walk.File, formatters []*Formatter) Task { - // sort by priority in ascending order - slices.SortFunc(formatters, func(a, b *Formatter) int { - priorityA := a.Priority() - priorityB := b.Priority() - - result := priorityA - priorityB - if result == 0 { - // formatters with the same priority are sorted lexicographically to ensure a deterministic outcome - result = cmp.Compare(a.Name(), b.Name()) - } - - return result - }) - - // construct a batch key which represents the unique sequence of formatters to be applied to file - var key string - for _, f := range 
formatters { - key += f.name + ":" - } - - key = key[:len(key)-1] - - return Task{ - File: file, - Formatters: formatters, - BatchKey: key, - } -} diff --git a/go.mod b/go.mod index 7612b490..c41d6525 100644 --- a/go.mod +++ b/go.mod @@ -8,13 +8,14 @@ require ( github.com/charmbracelet/log v0.4.0 github.com/gobwas/glob v0.2.3 github.com/otiai10/copy v1.14.0 + github.com/rogpeppe/go-internal v1.12.0 github.com/spf13/cobra v1.8.1 github.com/spf13/pflag v1.0.5 github.com/spf13/viper v1.19.0 github.com/stretchr/testify v1.9.0 - github.com/vmihailenco/msgpack/v5 v5.4.1 go.etcd.io/bbolt v1.3.11 golang.org/x/sync v0.8.0 + golang.org/x/sys v0.25.0 mvdan.cc/sh/v3 v3.9.0 ) @@ -43,11 +44,9 @@ require ( github.com/spf13/afero v1.11.0 // indirect github.com/spf13/cast v1.6.0 // indirect github.com/subosito/gotenv v1.6.0 // indirect - github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect go.uber.org/atomic v1.9.0 // indirect go.uber.org/multierr v1.9.0 // indirect golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect - golang.org/x/sys v0.25.0 // indirect golang.org/x/term v0.24.0 // indirect golang.org/x/text v0.18.0 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect diff --git a/go.sum b/go.sum index 9cff6309..846246d1 100644 --- a/go.sum +++ b/go.sum @@ -99,10 +99,6 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= -github.com/vmihailenco/msgpack/v5 v5.4.1 h1:cQriyiUvjTwOHg8QZaPihLWeRAAVoCpE00IUPn0Bjt8= -github.com/vmihailenco/msgpack/v5 v5.4.1/go.mod h1:GaZTsDaehaPpQVyxrf5mtQlH+pc21PIudVV/E3rRQok= -github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g= -github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds= go.etcd.io/bbolt v1.3.11 h1:yGEzV1wPz2yVCLsD8ZAiGHhHVlczyC9d1rP43/VCRJ0= go.etcd.io/bbolt v1.3.11/go.mod h1:dksAq7YMXoljX0xu6VF5DMZGbhYYoLUalEiSySYAS4I= go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= diff --git a/nix/packages/treefmt/gomod2nix.toml b/nix/packages/treefmt/gomod2nix.toml index 9f6a2cf6..71bb3cdc 100644 --- a/nix/packages/treefmt/gomod2nix.toml +++ b/nix/packages/treefmt/gomod2nix.toml @@ -70,6 +70,9 @@ schema = 3 [mod."github.com/rivo/uniseg"] version = "v0.4.7" hash = "sha256-rDcdNYH6ZD8KouyyiZCUEy8JrjOQoAkxHBhugrfHjFo=" + [mod."github.com/rogpeppe/go-internal"] + version = "v1.12.0" + hash = "sha256-qvDNCe3l84/LgrA8X4O15e1FeDcazyX91m9LmXGXX6M=" [mod."github.com/sagikazarmark/locafero"] version = "v0.4.0" hash = "sha256-7I1Oatc7GAaHgAqBFO6Tv4IbzFiYeU9bJAfJhXuWaXk=" @@ -100,12 +103,6 @@ schema = 3 [mod."github.com/subosito/gotenv"] version = "v1.6.0" hash = "sha256-LspbjTniiq2xAICSXmgqP7carwlNaLqnCTQfw2pa80A=" - [mod."github.com/vmihailenco/msgpack/v5"] - version = "v5.4.1" - hash = "sha256-pDplX6xU6UpNLcFbO1pRREW5vCnSPvSU+ojAwFDv3Hk=" - [mod."github.com/vmihailenco/tagparser/v2"] - version = "v2.0.0" - hash = "sha256-M9QyaKhSmmYwsJk7gkjtqu9PuiqZHSmTkous8VWkWY0=" [mod."go.etcd.io/bbolt"] version = "v1.3.11" hash = "sha256-SVWYZtE9TBgAo8xJSmo9DtSwuNa056N3zGvPLDJgiA8=" diff --git a/test/temp.go b/test/test.go similarity index 71% rename from test/temp.go rename to test/test.go index 274cb1dc..f243fbe7 100644 --- a/test/temp.go +++ b/test/test.go @@ -1,6 
+1,7 @@ package test import ( + "fmt" "os" "testing" "time" @@ -9,6 +10,7 @@ import ( "github.com/numtide/treefmt/config" cp "github.com/otiai10/copy" "github.com/stretchr/testify/require" + "golang.org/x/sys/unix" ) func WriteConfig(t *testing.T, path string, cfg *config.Config) { @@ -60,13 +62,20 @@ func TempFile(t *testing.T, dir string, pattern string, contents *string) *os.Fi return file } -func RecreateSymlink(t *testing.T, path string) error { +// Lutimes is a convenience wrapper for using unix.Lutimes +// TODO: this will need adapted if we support Windows. +func Lutimes(t *testing.T, path string, atime time.Time, mtime time.Time) error { t.Helper() - src, err := os.Readlink(path) + var utimes [2]unix.Timeval + utimes[0] = unix.NsecToTimeval(atime.UnixNano()) + utimes[1] = unix.NsecToTimeval(mtime.UnixNano()) - require.NoError(t, err, "failed to read symlink") - require.NoError(t, os.Remove(path), "failed to remove symlink") + // Change the timestamps of the path. If it's a symlink, it updates the symlink's timestamps, not the target's. + err := unix.Lutimes(path, utimes[0:]) + if err != nil { + return fmt.Errorf("failed to change times: %w", err) + } - return os.Symlink(src, path) + return nil } diff --git a/walk/cache/bucket.go b/walk/cache/bucket.go deleted file mode 100644 index dd2334e0..00000000 --- a/walk/cache/bucket.go +++ /dev/null @@ -1,100 +0,0 @@ -package cache - -import ( - "fmt" - - "github.com/vmihailenco/msgpack/v5" - bolt "go.etcd.io/bbolt" -) - -const ( - bucketPaths = "paths" - bucketFormatters = "formatters" -) - -var ErrKeyNotFound = fmt.Errorf("key not found") - -type Bucket[V any] struct { - bucket *bolt.Bucket -} - -func (b *Bucket[V]) Size() int { - return b.bucket.Stats().KeyN -} - -func (b *Bucket[V]) Get(key string) (*V, error) { - bytes := b.bucket.Get([]byte(key)) - if bytes == nil { - return nil, ErrKeyNotFound - } - - var value V - if err := msgpack.Unmarshal(bytes, &value); err != nil { - return nil, fmt.Errorf("failed to unmarshal cache entry for key '%v': %w", key, err) - } - - return &value, nil -} - -func (b *Bucket[V]) Put(key string, value *V) error { - if bytes, err := msgpack.Marshal(value); err != nil { - return fmt.Errorf("failed to marshal cache entry for key %v: %w", key, err) - } else if err = b.bucket.Put([]byte(key), bytes); err != nil { - return fmt.Errorf("failed to put cache entry for key %v: %w", key, err) - } - - return nil -} - -func (b *Bucket[V]) Delete(key string) error { - return b.bucket.Delete([]byte(key)) -} - -func (b *Bucket[V]) DeleteAll() error { - c := b.bucket.Cursor() - for k, v := c.First(); !(k == nil && v == nil); k, v = c.Next() { - if err := c.Delete(); err != nil { - return fmt.Errorf("failed to remove cache entry for key %s: %w", string(k), err) - } - } - - return nil -} - -func (b *Bucket[V]) ForEach(f func(string, *V) error) error { - return b.bucket.ForEach(func(key, bytes []byte) error { - var value V - if err := msgpack.Unmarshal(bytes, &value); err != nil { - return fmt.Errorf("failed to unmarshal cache entry for key '%v': %w", key, err) - } - - return f(string(key), &value) - }) -} - -func BucketPaths(tx *bolt.Tx) (*Bucket[Entry], error) { - return cacheBucket(bucketPaths, tx) -} - -func BucketFormatters(tx *bolt.Tx) (*Bucket[Entry], error) { - return cacheBucket(bucketFormatters, tx) -} - -func cacheBucket(name string, tx *bolt.Tx) (*Bucket[Entry], error) { - var ( - err error - b *bolt.Bucket - ) - - if tx.Writable() { - b, err = tx.CreateBucketIfNotExists([]byte(name)) - } else { - b = 
tx.Bucket([]byte(name)) - } - - if err != nil { - return nil, fmt.Errorf("failed to get/create bucket %s: %w", bucketPaths, err) - } - - return &Bucket[Entry]{b}, nil -} diff --git a/walk/cache/cache.go b/walk/cache/cache.go index fba706f0..fc6c94fc 100644 --- a/walk/cache/cache.go +++ b/walk/cache/cache.go @@ -1,24 +1,18 @@ package cache import ( - "crypto/sha1" //nolint:gosec + "crypto/sha256" "encoding/hex" "fmt" - "io/fs" "time" "github.com/adrg/xdg" bolt "go.etcd.io/bbolt" ) -type Entry struct { - Size int64 - Modified time.Time -} - -func (e *Entry) HasChanged(info fs.FileInfo) bool { - return !(e.Modified == info.ModTime() && e.Size == info.Size()) -} +const ( + bucketPaths = "paths" +) func Open(root string) (*bolt.DB, error) { var ( @@ -29,7 +23,7 @@ func Open(root string) (*bolt.DB, error) { // Otherwise, the database will be located in `XDG_CACHE_DIR/treefmt/eval-cache/.db`, where is // determined by hashing the treeRoot path. // This associates a given treeRoot with a given instance of the cache. - digest := sha1.Sum([]byte(root)) //nolint:gosec + digest := sha256.Sum256([]byte(root)) name := hex.EncodeToString(digest[:]) if path, err = xdg.CacheFile(fmt.Sprintf("treefmt/eval-cache/%v.db", name)); err != nil { @@ -42,29 +36,36 @@ func Open(root string) (*bolt.DB, error) { return nil, err } + // ensure bucket exist + err = db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucketIfNotExists([]byte(bucketPaths)) + + return err + }) + if err != nil { + return nil, fmt.Errorf("failed to create bucket: %w", err) + } + return db, nil } -func EnsureBuckets(db *bolt.DB) error { - // force creation of buckets if they don't already exist - return db.Update(func(tx *bolt.Tx) error { - if _, err := BucketPaths(tx); err != nil { - return err - } +func PathsBucket(tx *bolt.Tx) *bolt.Bucket { + return tx.Bucket([]byte("paths")) +} - _, err := BucketFormatters(tx) +func deleteAll(bucket *bolt.Bucket) error { + c := bucket.Cursor() + for k, v := c.First(); !(k == nil && v == nil); k, v = c.Next() { + if err := c.Delete(); err != nil { + return fmt.Errorf("failed to remove cache entry for key %s: %w", string(k), err) + } + } - return err - }) + return nil } func Clear(db *bolt.DB) error { return db.Update(func(tx *bolt.Tx) error { - bucket, err := BucketPaths(tx) - if err != nil { - return fmt.Errorf("failed to get paths bucket: %w", err) - } - - return bucket.DeleteAll() + return deleteAll(PathsBucket(tx)) }) } diff --git a/walk/cached.go b/walk/cached.go index cfecc902..4f39037c 100644 --- a/walk/cached.go +++ b/walk/cached.go @@ -52,20 +52,17 @@ func (c *CachedReader) process() error { } return c.db.Update(func(tx *bolt.Tx) error { - // get the paths bucket - bucket, err := cache.BucketPaths(tx) - if err != nil { - return fmt.Errorf("failed to get bucket: %w", err) - } + bucket := cache.PathsBucket(tx) - // for each file in the batch, add a new cache entry with update size and mod time. 
+ // for each file in the batch, calculate its new format signature and update the bucket entry for _, file := range batch { - entry := &cache.Entry{ - Size: file.Info.Size(), - Modified: file.Info.ModTime(), + signature, err := file.NewFormatSignature() + if err != nil { + return fmt.Errorf("failed to calculate signature for path %s: %w", file.RelPath, err) } - if err = bucket.Put(file.RelPath, entry); err != nil { - return fmt.Errorf("failed to put entry for path %s: %w", file.RelPath, err) + + if err := bucket.Put([]byte(file.RelPath), signature); err != nil { + return fmt.Errorf("failed to put format signature for path %s: %w", file.RelPath, err) } } @@ -92,7 +89,8 @@ func (c *CachedReader) process() error { func (c *CachedReader) Read(ctx context.Context, files []*File) (n int, err error) { err = c.db.View(func(tx *bolt.Tx) error { // get paths bucket - bucket, err := cache.BucketPaths(tx) + bucket := cache.PathsBucket(tx) + if err != nil { return fmt.Errorf("failed to get bucket: %w", err) } @@ -104,13 +102,7 @@ func (c *CachedReader) Read(ctx context.Context, files []*File) (n int, err erro for i := 0; i < n; i++ { file := files[i] - // lookup cache entry and append to the file - var bucketErr error - - file.Cache, bucketErr = bucket.Get(file.RelPath) - if !(bucketErr == nil || errors.Is(bucketErr, cache.ErrKeyNotFound)) { - return bucketErr - } + file.CachedFormatSignature = bucket.Get([]byte(file.RelPath)) // set a release function which inserts this file into the update channel file.AddReleaseFunc(func(ctx context.Context) error { @@ -145,19 +137,13 @@ func (c *CachedReader) Close() error { // NewCachedReader creates a cache Reader instance, backed by a bolt DB and delegating reads to delegate. func NewCachedReader(db *bolt.DB, batchSize int, delegate Reader) (*CachedReader, error) { - // force the creation of the necessary buckets if we're dealing with an empty db - if err := cache.EnsureBuckets(db); err != nil { - return nil, fmt.Errorf("failed to create cache buckets: %w", err) - } - - // create an error group for managing the processing loop - eg := &errgroup.Group{} + eg := &errgroup.Group{} // create an error group for managing the processing loop r := &CachedReader{ db: db, batchSize: batchSize, delegate: delegate, - log: log.WithPrefix("walk[cache]"), + log: log.WithPrefix("walk | cache"), eg: eg, updateCh: make(chan *File, batchSize*runtime.NumCPU()), } diff --git a/walk/cached_test.go b/walk/cached_test.go deleted file mode 100644 index d74f981f..00000000 --- a/walk/cached_test.go +++ /dev/null @@ -1,137 +0,0 @@ -package walk_test - -import ( - "context" - "errors" - "io" - "os" - "path/filepath" - "testing" - "time" - - "github.com/numtide/treefmt/stats" - "github.com/numtide/treefmt/test" - "github.com/numtide/treefmt/walk" - "github.com/numtide/treefmt/walk/cache" - "github.com/stretchr/testify/require" -) - -func TestCachedReader(t *testing.T) { - as := require.New(t) - - batchSize := 1024 - tempDir := test.TempExamples(t) - - readAll := func(path string) (totalCount, newCount, changeCount int) { - statz := stats.New() - - db, err := cache.Open(tempDir) - as.NoError(err) - defer db.Close() - - delegate := walk.NewFilesystemReader(tempDir, path, &statz, batchSize) - reader, err := walk.NewCachedReader(db, batchSize, delegate) - as.NoError(err) - - for { - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) - - files := make([]*walk.File, 8) - n, err := reader.Read(ctx, files) - - totalCount += n - - for idx := 0; idx < n; idx++ { - file := 
files[idx] - - if file.Cache == nil { - newCount++ - } else if file.Cache.HasChanged(file.Info) { - changeCount++ - } - - as.NoError(file.Release(ctx)) - } - - cancel() - - if errors.Is(err, io.EOF) { - break - } - } - - as.NoError(reader.Close()) - - return totalCount, newCount, changeCount - } - - totalCount, newCount, changeCount := readAll("") - as.Equal(32, totalCount) - as.Equal(32, newCount) - as.Equal(0, changeCount) - - // read again, should be no changes - totalCount, newCount, changeCount = readAll("") - as.Equal(32, totalCount) - as.Equal(0, newCount) - as.Equal(0, changeCount) - - // change mod times on some files and try again - // we subtract a second to account for the 1 second granularity of modtime according to POSIX - modTime := time.Now().Add(-1 * time.Second) - - as.NoError(os.Chtimes(filepath.Join(tempDir, "treefmt.toml"), time.Now(), modTime)) - as.NoError(os.Chtimes(filepath.Join(tempDir, "shell/foo.sh"), time.Now(), modTime)) - as.NoError(os.Chtimes(filepath.Join(tempDir, "haskell/Nested/Foo.hs"), time.Now(), modTime)) - - totalCount, newCount, changeCount = readAll("") - as.Equal(32, totalCount) - as.Equal(0, newCount) - as.Equal(3, changeCount) - - // create some files and try again - _, err := os.Create(filepath.Join(tempDir, "new.txt")) - as.NoError(err) - - _, err = os.Create(filepath.Join(tempDir, "fizz.go")) - as.NoError(err) - - totalCount, newCount, changeCount = readAll("") - as.Equal(34, totalCount) - as.Equal(2, newCount) - as.Equal(0, changeCount) - - // modify some files - f, err := os.OpenFile(filepath.Join(tempDir, "new.txt"), os.O_WRONLY, 0o644) - as.NoError(err) - _, err = f.Write([]byte("foo")) - as.NoError(err) - as.NoError(f.Close()) - - f, err = os.OpenFile(filepath.Join(tempDir, "fizz.go"), os.O_WRONLY, 0o644) - as.NoError(err) - _, err = f.Write([]byte("bla")) - as.NoError(err) - as.NoError(f.Close()) - - totalCount, newCount, changeCount = readAll("") - as.Equal(34, totalCount) - as.Equal(0, newCount) - as.Equal(2, changeCount) - - // read some paths within the root - totalCount, newCount, changeCount = readAll("go") - as.Equal(2, totalCount) - as.Equal(0, newCount) - as.Equal(0, changeCount) - - totalCount, newCount, changeCount = readAll("elm/src") - as.Equal(1, totalCount) - as.Equal(0, newCount) - as.Equal(0, changeCount) - - totalCount, newCount, changeCount = readAll("haskell") - as.Equal(7, totalCount) - as.Equal(0, newCount) - as.Equal(0, changeCount) -} diff --git a/walk/filesystem.go b/walk/filesystem.go index 000aa9d8..7c22e605 100644 --- a/walk/filesystem.go +++ b/walk/filesystem.go @@ -128,7 +128,7 @@ func NewFilesystemReader( eg := errgroup.Group{} r := FilesystemReader{ - log: log.WithPrefix("walk[filesystem]"), + log: log.WithPrefix("walk | filesystem"), root: root, path: path, batchSize: batchSize, diff --git a/walk/git.go b/walk/git.go index 7681d150..fd079e68 100644 --- a/walk/git.go +++ b/walk/git.go @@ -119,6 +119,6 @@ func NewGitReader( path: path, stats: statz, eg: &errgroup.Group{}, - log: log.WithPrefix("walk[git]"), + log: log.WithPrefix("walk | git"), }, nil } diff --git a/walk/walk.go b/walk/walk.go index 9480563f..8c72735a 100644 --- a/walk/walk.go +++ b/walk/walk.go @@ -2,16 +2,15 @@ package walk import ( "context" + "crypto/md5" //nolint:gosec "errors" "fmt" "io" "io/fs" "os" "path/filepath" - "time" "github.com/numtide/treefmt/stats" - "github.com/numtide/treefmt/walk/cache" bolt "go.etcd.io/bbolt" ) @@ -35,12 +34,59 @@ type File struct { RelPath string Info fs.FileInfo - // Cache is the latest entry found 
for this file, if one exists. - Cache *cache.Entry + // FormattedInfo is the result of os.stat after formatting the file. + FormattedInfo fs.FileInfo + + // FormattersSignature represents the sequence of formatters and their config that was applied to this file. + FormattersSignature []byte + + // CachedFormatSignature is the last FormatSignature generated for this file, retrieved from the cache. + CachedFormatSignature []byte releaseFuncs []ReleaseFunc } +func formatSignature(formattersSig []byte, info fs.FileInfo) []byte { + h := md5.New() //nolint:gosec + h.Write(formattersSig) + // add mod time and size + h.Write([]byte(fmt.Sprintf("%v %v", info.ModTime().Unix(), info.Size()))) + + return h.Sum(nil) +} + +// FormatSignature takes the file's info from when it was traversed and appends it to formattersSig, generating +// a unique format signature which encapsulates the sequence of formatters that were applied to this file and the +// outcome. +func (f *File) FormatSignature(formattersSig []byte) ([]byte, error) { + if f.Info == nil { + return nil, fmt.Errorf("file has no info") + } + + return formatSignature(formattersSig, f.Info), nil +} + +// NewFormatSignature takes the file's info after being formatted and appends it to FormattersSignature, generating +// a unique format signature which encapsulates the sequence of formatters that were applied to this file and the +// outcome. +func (f *File) NewFormatSignature() ([]byte, error) { + info := f.FormattedInfo // we start by assuming the file was formatted + if info == nil { + // if it wasn't, we fall back to the original file info from when it was first read + info = f.Info + } + + if info == nil { + // ensure info is not nil + return nil, fmt.Errorf("file has no info") + } else if f.FormattersSignature == nil { + // ensure we have a formatters signature + return nil, fmt.Errorf("file has no formatters signature") + } + + return formatSignature(f.FormattersSignature, info), nil +} + // Release calls all registered release functions for the File and returns an error if any function fails. // Accepts a context which can be used to pass parameters to the release hooks. func (f *File) Release(ctx context.Context) error { @@ -76,7 +122,7 @@ func (f *File) Stat() (changed bool, info fs.FileInfo, err error) { // Some formatters mess with the mod time (e.g. dos2unix) but not to the same precision, // triggering false positives. // We truncate everything below a second. - if f.Info.ModTime().Truncate(time.Second) != current.ModTime().Truncate(time.Second) { + if f.Info.ModTime().Unix() != current.ModTime().Unix() { return true, current, nil }