From fd4c6757204b2a701213c9b7c42fa15e05ff143c Mon Sep 17 00:00:00 2001
From: Javi Fontan
Date: Tue, 16 Oct 2018 15:00:47 +0200
Subject: [PATCH] Improve siva index generation in ReadWriter

When using a ReadWriter the index is regenerated each time Index is
called. These are the steps to generate one usable index:

* Create a new index merging the index from the file and the current
  changes
* Remove duplicates
* Sort the index by position

Also, to find a file the index is walked until a match is found. This
needs to be done each time a file has to be opened. For a small number
of files this is OK, but when a repo has a lot of references the time
spent here adds up.

Now there is a new index type called OrderedIndex that stores the
IndexEntries in lexicographic order. This allows binary searches for
faster file location and also makes it possible to update the index
instead of regenerating it each time.

Signed-off-by: Javi Fontan
---
 index.go           | 117 +++++++++++++++++++++++++++++++++++++++++++--
 readwriter.go      |   7 ++-
 readwriter_test.go |  64 ++++++++++++++++++++++++-
 writer.go          |   3 ++
 4 files changed, 186 insertions(+), 5 deletions(-)

diff --git a/index.go b/index.go
index 64928ef..2206092 100644
--- a/index.go
+++ b/index.go
@@ -161,13 +161,25 @@ func (i *Index) WriteTo(w io.Writer) error {
 	return nil
 }
 
-func (s Index) Len() int { return len(s) }
-func (s Index) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+// Len implements sort.Interface.
+func (s Index) Len() int { return len(s) }
+
+// Swap implements sort.Interface.
+func (s Index) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+
+// Less implements sort.Interface.
 func (s Index) Less(i, j int) bool { return s[i].absStart < s[j].absStart }
 
 // Filter returns a filtered version of the current Index removing duplicates
 // keeping the latest versions and filtering all the deleted files
 func (i *Index) Filter() Index {
+	index := i.filter()
+	sort.Sort(index)
+
+	return index
+}
+
+func (i *Index) filter() Index {
 	var f Index
 
 	seen := make(map[string]bool)
@@ -186,7 +198,6 @@ func (i *Index) Filter() Index {
 		f = append(f, e)
 	}
 
-	sort.Sort(f)
 	return f
 }
 
@@ -235,6 +246,106 @@ func (i Index) Glob(pattern string) ([]*IndexEntry, error) {
 	return matches, nil
 }
 
+// OrderedIndex is a specialized index, lexicographically ordered. It has
+// methods to add or delete IndexEntries and maintain its order. It also
+// has a faster Find method.
+type OrderedIndex Index
+
+// Pos gets the position of the file in the index or where it should be
+// inserted if it's not already there.
+func (o OrderedIndex) Pos(path string) int {
+	if len(o) == 0 {
+		return 0
+	}
+
+	pos := sort.Search(len(o), func(i int) bool {
+		return o[i].Name >= path
+	})
+
+	return pos
+}
+
+// Update adds an IndexEntry to the index or deletes it depending on the
+// FlagDeleted value.
+func (o OrderedIndex) Update(e *IndexEntry) OrderedIndex {
+	if e == nil {
+		return o
+	}
+
+	if e.Flags&FlagDeleted == 0 {
+		return o.Add(e)
+	}
+
+	return o.Delete(e.Name)
+}
+
+// Add returns an updated index with the new IndexEntry.
+func (o OrderedIndex) Add(e *IndexEntry) OrderedIndex {
+	if e == nil {
+		return o
+	}
+
+	if len(o) == 0 {
+		return OrderedIndex{e}
+	}
+
+	path := e.Name
+	pos := o.Pos(path)
+	if pos < len(o) && o[pos].Name == path {
+		o[pos] = e
+		return o
+	}
+
+	if pos == len(o) {
+		return append(o, e)
+	}
+
+	return append(o[:pos], append(Index{e}, o[pos:]...)...)
+}
+
+// Delete returns an updated index with the IndexEntry for the path deleted.
+func (o OrderedIndex) Delete(path string) OrderedIndex {
+	if len(o) == 0 {
+		return o
+	}
+
+	pos := o.Pos(path)
+	if pos >= len(o) || o[pos].Name != path {
+		return o
+	}
+
+	return append(o[:pos], o[pos+1:]...)
+}
+
+// Find returns the IndexEntry for a path or nil. This version is faster than
+// Index.Find.
+func (o OrderedIndex) Find(path string) *IndexEntry {
+	if len(o) == 0 {
+		return nil
+	}
+
+	pos := o.Pos(path)
+	if pos >= 0 && pos < len(o) && o[pos].Name == path {
+		return o[pos]
+	}
+
+	return nil
+}
+
+// Sort orders the index lexicographically.
+func (o OrderedIndex) Sort() {
+	sort.Sort(o)
+}
+
+// Len implements sort.Interface.
+func (s OrderedIndex) Len() int { return len(s) }
+
+// Swap implements sort.Interface.
+func (s OrderedIndex) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+
+// Less implements sort.Interface.
+func (s OrderedIndex) Less(i, j int) bool { return s[i].Name < s[j].Name }
+
 type IndexEntry struct {
 	Header
 	Start uint64
diff --git a/readwriter.go b/readwriter.go
index 94c4faf..f872b98 100644
--- a/readwriter.go
+++ b/readwriter.go
@@ -26,12 +26,17 @@ func NewReaderWriter(rw io.ReadWriteSeeker) (*ReadWriter, error) {
 	}
 
 	w := newWriter(rw)
+	w.oIndex = OrderedIndex(i.filter())
+	w.oIndex.Sort()
+
 	getIndexFunc := func() (Index, error) {
 		for _, e := range w.index {
 			e.absStart = uint64(end) + e.Start
 		}
-		return append(i, w.index...), nil
+
+		return Index(w.oIndex), nil
 	}
+
 	r := newReaderWithIndex(rw, getIndexFunc)
 	return &ReadWriter{r, w}, nil
 }
diff --git a/readwriter_test.go b/readwriter_test.go
index 6d31e09..d34ac33 100644
--- a/readwriter_test.go
+++ b/readwriter_test.go
@@ -67,7 +67,15 @@ func (s *ReadWriterSuite) testWriteRead(c *C, f *os.File, iter int) {
 		index, err := rw.Index()
 		c.Assert(err, IsNil)
-		c.Assert(len(index), Equals, iters*iter+i+1)
+
+		// index after the first iteration will contain the total number
+		// of files
+		num := i + 1
+		if iter > 0 {
+			num = iters
+		}
+
+		c.Assert(len(index), Equals, num)
 
 		e := index.Find(curName)
 		c.Assert(e, NotNil)
@@ -172,3 +180,57 @@ func (_ dummyReadWriterSeeker) Write(p []byte) (n int, err error) {
 func (_ dummyReadWriterSeeker) Seek(offset int64, whence int) (n int64, err error) {
 	return
 }
+
+func (s *ReadWriterSuite) TestDelete(c *C) {
+	data := "data"
+
+	path := filepath.Join(s.tmpDir, c.TestName())
+	tmpFile, err := os.Create(path)
+	c.Assert(err, IsNil)
+	c.Assert(tmpFile, NotNil)
+
+	rw, err := siva.NewReaderWriter(tmpFile)
+	c.Assert(err, IsNil)
+
+	testSteps := []struct {
+		name  string
+		del   bool
+		files []string
+	}{
+		{"one", false, []string{"one"}},
+		{"two", false, []string{"one", "two"}},
+		{"three", false, []string{"one", "three", "two"}},
+		{"two", true, []string{"one", "three"}},
+		{"two", false, []string{"one", "three", "two"}},
+		{"four", true, []string{"one", "three", "two"}},
+		{"three", true, []string{"one", "two"}},
+	}
+
+	for _, t := range testSteps {
+		var flags siva.Flag
+		if t.del {
+			flags = siva.FlagDeleted
+		}
+
+		err := rw.WriteHeader(&siva.Header{
+			Name:  t.name,
+			Flags: flags,
+		})
+		c.Assert(err, IsNil)
+
+		written, err := rw.Write([]byte(data))
+		c.Assert(err, IsNil)
+		c.Assert(written, Equals, len(data))
+
+		err = rw.Flush()
+		c.Assert(err, IsNil)
+
+		index, err := rw.Index()
+		c.Assert(err, IsNil)
+
+		c.Assert(len(index), Equals, len(t.files))
+		for i, name := range t.files {
+			c.Assert(index[i].Name, Equals, name)
+		}
+	}
+}
diff --git a/writer.go b/writer.go
index 4cf9176..abfa1c8 100644
--- a/writer.go
+++ b/writer.go
@@ -21,6 +21,7 @@ type Writer interface {
 type writer struct {
 	w        *hashedWriter
 	index    Index
+	oIndex   OrderedIndex
 	current  *IndexEntry
 	position uint64
 	closed   bool
@@ -49,6 +50,8 @@ func (w *writer) WriteHeader(h *Header) error {
 	}
 
 	w.index = append(w.index, w.current)
+	w.oIndex = w.oIndex.Update(w.current)
+
 	return nil
 }
 
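
As a quick illustration of the idea behind OrderedIndex (a binary search over
a name-sorted slice instead of a linear walk of the whole index), here is a
small standalone Go sketch. The names and types in it (entry, ordered) are
simplified stand-ins for illustration only, not part of the siva API:

package main

import (
	"fmt"
	"sort"
)

// entry is a simplified stand-in for siva's IndexEntry.
type entry struct {
	Name string
}

// ordered keeps entries sorted by Name, mirroring what OrderedIndex does.
type ordered []*entry

// pos returns the position of name, or the position where it would have to
// be inserted to keep the slice sorted.
func (o ordered) pos(name string) int {
	return sort.Search(len(o), func(i int) bool {
		return o[i].Name >= name
	})
}

// add inserts or replaces an entry while keeping lexicographic order.
func (o ordered) add(e *entry) ordered {
	p := o.pos(e.Name)
	if p < len(o) && o[p].Name == e.Name {
		o[p] = e // a newer version of the same file replaces the old one
		return o
	}
	if p == len(o) {
		return append(o, e)
	}
	return append(o[:p], append(ordered{e}, o[p:]...)...)
}

// find locates an entry with a binary search instead of scanning every entry.
func (o ordered) find(name string) *entry {
	p := o.pos(name)
	if p < len(o) && o[p].Name == name {
		return o[p]
	}
	return nil
}

func main() {
	var idx ordered
	for _, n := range []string{"two", "one", "three"} {
		idx = idx.add(&entry{Name: n})
	}
	fmt.Println(idx.find("three") != nil) // true
	fmt.Println(idx.find("four") != nil)  // false
}

Keeping the slice ordered on every WriteHeader is what lets the ReadWriter
return the index directly instead of rebuilding and re-sorting it on every
Index call.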