From fd4c6757204b2a701213c9b7c42fa15e05ff143c Mon Sep 17 00:00:00 2001
From: Javi Fontan
Date: Tue, 16 Oct 2018 15:00:47 +0200
Subject: [PATCH] Improve siva index generation in ReadWriter

When using a ReadWriter the index is regenerated each time Index is
called. These are the steps to generate one usable index:

* Create a new index merging the index from the file and the current
  changes
* Remove duplicates
* Sort the index by position

Also, to find a file the index is walked until a match is found. This
needs to be done each time a file has to be opened. For a small number
of files this is OK, but when a repo has a lot of references the time
spent here adds up.

Now there is a new index type called OrderedIndex that stores the
IndexEntries in lexicographic order. This allows binary searches for
faster file location and also makes it possible to update the index
instead of regenerating it each time.

Signed-off-by: Javi Fontan
---
 index.go           | 117 +++++++++++++++++++++++++++++++++++++++++++--
 readwriter.go      |   7 ++-
 readwriter_test.go |  64 ++++++++++++++++++++++++-
 writer.go          |   3 ++
 4 files changed, 186 insertions(+), 5 deletions(-)

diff --git a/index.go b/index.go
index 64928ef..2206092 100644
--- a/index.go
+++ b/index.go
@@ -161,13 +161,25 @@ func (i *Index) WriteTo(w io.Writer) error {
 	return nil
 }
 
-func (s Index) Len() int { return len(s) }
-func (s Index) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+// Len implements sort.Interface.
+func (s Index) Len() int { return len(s) }
+
+// Swap implements sort.Interface.
+func (s Index) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+
+// Less implements sort.Interface.
 func (s Index) Less(i, j int) bool { return s[i].absStart < s[j].absStart }
 
 // Filter returns a filtered version of the current Index removing duplicates
 // keeping the latest versions and filtering all the deleted files
 func (i *Index) Filter() Index {
+	index := i.filter()
+	sort.Sort(index)
+
+	return index
+}
+
+func (i *Index) filter() Index {
 	var f Index
 
 	seen := make(map[string]bool)
@@ -186,7 +198,6 @@ func (i *Index) Filter() Index {
 		f = append(f, e)
 	}
 
-	sort.Sort(f)
 	return f
 }
 
@@ -235,6 +246,106 @@ func (i Index) Glob(pattern string) ([]*IndexEntry, error) {
 	return matches, nil
 }
 
+// OrderedIndex is a specialized index, lexicographically ordered. It has
+// methods to add or delete IndexEntries and maintain its order. It also
+// has a faster Find method.
+type OrderedIndex Index
+
+// Pos gets the position of the file in the index or where it should be
+// inserted if it's not already there.
+func (o OrderedIndex) Pos(path string) int {
+	if len(o) == 0 {
+		return 0
+	}
+
+	pos := sort.Search(len(o), func(i int) bool {
+		return o[i].Name >= path
+	})
+
+	return pos
+}
+
+// Update adds an IndexEntry to the index or deletes it depending on the
+// FlagDeleted value.
+func (o OrderedIndex) Update(e *IndexEntry) OrderedIndex {
+	if e == nil {
+		return o
+	}
+
+	if e.Flags&FlagDeleted == 0 {
+		return o.Add(e)
+	}
+
+	return o.Delete(e.Name)
+}
+
+// Add returns an updated index with the new IndexEntry.
+func (o OrderedIndex) Add(e *IndexEntry) OrderedIndex {
+	if e == nil {
+		return o
+	}
+
+	if len(o) == 0 {
+		return OrderedIndex{e}
+	}
+
+	path := e.Name
+	pos := o.Pos(path)
+	if pos < len(o) && o[pos].Name == path {
+		o[pos] = e
+		return o
+	}
+
+	if pos == len(o) {
+		return append(o, e)
+	}
+
+	return append(o[:pos], append(Index{e}, o[pos:]...)...)
+}
+
+// Delete returns an updated index with the IndexEntry for the path deleted.
+func (o OrderedIndex) Delete(path string) OrderedIndex {
+	if len(o) == 0 {
+		return o
+	}
+
+	pos := o.Pos(path)
+	if pos >= len(o) || o[pos].Name != path {
+		return o
+	}
+
+	return append(o[:pos], o[pos+1:]...)
+}
+
+// Find returns the IndexEntry for a path or nil. This version is faster than
+// Index.Find.
+func (o OrderedIndex) Find(path string) *IndexEntry {
+	if len(o) == 0 {
+		return nil
+	}
+
+	pos := o.Pos(path)
+	if pos >= 0 && pos < len(o) && o[pos].Name == path {
+		return o[pos]
+	}
+
+	return nil
+}
+
+// Sort orders the index lexicographically.
+func (o OrderedIndex) Sort() {
+	sort.Sort(o)
+}
+
+// Len implements sort.Interface.
+func (s OrderedIndex) Len() int { return len(s) }
+
+// Swap implements sort.Interface.
+func (s OrderedIndex) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+
+// Less implements sort.Interface.
+func (s OrderedIndex) Less(i, j int) bool { return s[i].Name < s[j].Name }
+
 type IndexEntry struct {
 	Header
 	Start uint64
diff --git a/readwriter.go b/readwriter.go
index 94c4faf..f872b98 100644
--- a/readwriter.go
+++ b/readwriter.go
@@ -26,12 +26,17 @@ func NewReaderWriter(rw io.ReadWriteSeeker) (*ReadWriter, error) {
 	}
 
 	w := newWriter(rw)
+	w.oIndex = OrderedIndex(i.filter())
+	w.oIndex.Sort()
+
 	getIndexFunc := func() (Index, error) {
 		for _, e := range w.index {
 			e.absStart = uint64(end) + e.Start
 		}
-		return append(i, w.index...), nil
+
+		return Index(w.oIndex), nil
 	}
+
 	r := newReaderWithIndex(rw, getIndexFunc)
 	return &ReadWriter{r, w}, nil
 }
diff --git a/readwriter_test.go b/readwriter_test.go
index 6d31e09..d34ac33 100644
--- a/readwriter_test.go
+++ b/readwriter_test.go
@@ -67,7 +67,15 @@ func (s *ReadWriterSuite) testWriteRead(c *C, f *os.File, iter int) {
 		index, err := rw.Index()
 		c.Assert(err, IsNil)
-		c.Assert(len(index), Equals, iters*iter+i+1)
+
+		// index after the first iteration will contain the total number
+		// of files
+		num := i + 1
+		if iter > 0 {
+			num = iters
+		}
+
+		c.Assert(len(index), Equals, num)
 
 		e := index.Find(curName)
 		c.Assert(e, NotNil)
@@ -172,3 +180,57 @@ func (_ dummyReadWriterSeeker) Write(p []byte) (n int, err error) {
 func (_ dummyReadWriterSeeker) Seek(offset int64, whence int) (n int64, err error) {
 	return
 }
+
+func (s *ReadWriterSuite) TestDelete(c *C) {
+	data := "data"
+
+	path := filepath.Join(s.tmpDir, c.TestName())
+	tmpFile, err := os.Create(path)
+	c.Assert(err, IsNil)
+	c.Assert(tmpFile, NotNil)
+
+	rw, err := siva.NewReaderWriter(tmpFile)
+	c.Assert(err, IsNil)
+
+	testSteps := []struct {
+		name  string
+		del   bool
+		files []string
+	}{
+		{"one", false, []string{"one"}},
+		{"two", false, []string{"one", "two"}},
+		{"three", false, []string{"one", "three", "two"}},
+		{"two", true, []string{"one", "three"}},
+		{"two", false, []string{"one", "three", "two"}},
+		{"four", true, []string{"one", "three", "two"}},
+		{"three", true, []string{"one", "two"}},
+	}
+
+	for _, t := range testSteps {
+		var flags siva.Flag
+		if t.del {
+			flags = siva.FlagDeleted
+		}
+
+		err := rw.WriteHeader(&siva.Header{
+			Name:  t.name,
+			Flags: flags,
+		})
+		c.Assert(err, IsNil)
+
+		written, err := rw.Write([]byte(data))
+		c.Assert(err, IsNil)
+		c.Assert(written, Equals, len(data))
+
+		err = rw.Flush()
+		c.Assert(err, IsNil)
+
+		index, err := rw.Index()
+		c.Assert(err, IsNil)
+
+		c.Assert(len(index), Equals, len(t.files))
+		for i, name := range t.files {
+			c.Assert(index[i].Name, Equals, name)
+		}
+	}
+}
diff --git a/writer.go b/writer.go
index 4cf9176..abfa1c8 100644
--- a/writer.go
+++ b/writer.go
@@ -21,6 +21,7 @@ type Writer interface {
 type writer struct {
 	w        *hashedWriter
 	index    Index
+	oIndex   OrderedIndex
 	current  *IndexEntry
 	position uint64
 	closed   bool
@@ -49,6 +50,8 @@ func (w *writer) WriteHeader(h *Header) error {
 	}
 
 	w.index = append(w.index, w.current)
+	w.oIndex = w.oIndex.Update(w.current)
+
 	return nil
 }
 
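
As a quick illustration of the idea behind OrderedIndex (a binary search over
a name-sorted slice instead of a linear walk of the whole index), here is a
small standalone Go sketch. The names and types in it (entry, ordered) are
simplified stand-ins for illustration only, not part of the siva API:

package main

import (
	"fmt"
	"sort"
)

// entry is a simplified stand-in for siva's IndexEntry.
type entry struct {
	Name string
}

// ordered keeps entries sorted by Name, mirroring what OrderedIndex does.
type ordered []*entry

// pos returns the position of name, or the position where it would have to
// be inserted to keep the slice sorted.
func (o ordered) pos(name string) int {
	return sort.Search(len(o), func(i int) bool {
		return o[i].Name >= name
	})
}

// add inserts or replaces an entry while keeping lexicographic order.
func (o ordered) add(e *entry) ordered {
	p := o.pos(e.Name)
	if p < len(o) && o[p].Name == e.Name {
		o[p] = e // a newer version of the same file replaces the old one
		return o
	}
	if p == len(o) {
		return append(o, e)
	}
	return append(o[:p], append(ordered{e}, o[p:]...)...)
}

// find locates an entry with a binary search instead of scanning every entry.
func (o ordered) find(name string) *entry {
	p := o.pos(name)
	if p < len(o) && o[p].Name == name {
		return o[p]
	}
	return nil
}

func main() {
	var idx ordered
	for _, n := range []string{"two", "one", "three"} {
		idx = idx.add(&entry{Name: n})
	}
	fmt.Println(idx.find("three") != nil) // true
	fmt.Println(idx.find("four") != nil)  // false
}

Keeping the slice ordered on every WriteHeader is what lets the ReadWriter
return the index directly instead of rebuilding and re-sorting it on every
Index call.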