biogo · kortschak · Sep 17, 2015 · Sep 15, 2015
diff --git a/bgzf/index/index.go b/bgzf/index/index.go
@@ -50,32 +50,54 @@ func NewChunkReader(r *bgzf.Reader, chunks []bgzf.Chunk) (*ChunkReader, error) {
 
 // Read satisfies the io.Reader interface.
 func (r *ChunkReader) Read(p []byte) (int, error) {
-	if len(r.chunks) == 0 || vOffset(r.r.LastChunk().End) >= vOffset(r.chunks[0].End) {
+	if len(r.chunks) == 0 {
+		return 0, io.EOF
+	}
+	last := r.r.LastChunk()
+	if vOffset(last.End) >= vOffset(r.chunks[0].End) {
 		return 0, io.EOF
 	}
 
 	// Ensure the byte slice does not extend beyond the end of
 	// the current chunk. We do not need to consider reading
 	// beyond the end of the block because the bgzf.Reader is in
 	// blocked mode and so will stop there anyway.
-	if r.r.LastChunk().End.File == r.chunks[0].End.File {
-		p = p[:min(len(p), int(r.chunks[0].End.Block-r.r.LastChunk().End.Block))]
+	want := int(r.chunks[0].End.Block) // block offset of the current chunk's end
+	if r.chunks[0].End.Block == 0 && r.chunks[0].End.File > last.End.File {
+		// Special case: the chunk ends at block offset zero of a
+		// later bgzf block than the one last read from. There is no
+		// in-block limit to apply, so allow up to len(p) bytes. This
+		// is safe because the bgzf Reader is in blocked mode and so
+		// will not read past the end of the current bgzf block.
+		want = len(p)
 	}
-
-	n, err := r.r.Read(p)
+	var cursor int // block offset to measure from; zero unless set below
+	if last.End.File == r.chunks[0].End.File {
+		// The current chunk ends in the same bgzf block as the last
+		// read, so measure from the last read's end offset: want-cursor
+		// is then the remaining distance to the end of the chunk.
+		cursor = int(last.End.Block)
+	}
+	n, err := r.r.Read(p[:min(len(p), want-cursor)])
 	if err != nil {
 		if n != 0 && err == io.EOF {
 			err = nil
 		}
 		return n, err
 	}
-	if len(r.chunks) != 0 && vOffset(r.r.LastChunk().End) >= vOffset(r.chunks[0].End) {
+
+	// Advance to the next chunk if the read reached or passed the
+	// end of the current chunk, or if LastChunk did not change even
+	// though p was not zero length (that is, no progress was made).
+	this := r.r.LastChunk()
+	if (len(p) != 0 && this == last) || vOffset(this.End) >= vOffset(r.chunks[0].End) {
 		r.chunks = r.chunks[1:]
 		if len(r.chunks) == 0 {
 			return n, io.EOF
 		}
 		err = r.r.Seek(r.chunks[0].Begin)
 	}
+
 	return n, err
 }
 

diff --git a/bgzf/index/index_test.go b/bgzf/index/index_test.go
@@ -8,6 +8,7 @@ import (
 	"bytes"
 	"flag"
 	"io"
+	"strings"
 	"testing"
 
 	"github.com/biogo/hts/bgzf"
@@ -114,3 +115,165 @@ func (s *S) TestIssue8(c *check.C) {
 		}
 	}
 }
+
+// issue10Tests are test cases for https://github.com/biogo/hts/issues/10.
+var issue10Tests = []struct {
+	words     []wordBlocks // words written to the bgzf stream, in order
+	chunks    []string     // words whose chunks are read back through the ChunkReader
+	canSquash bool         // whether to also run with the adjacent merge strategy
+	canTrunc  bool         // whether to also run with the final chunk truncated away
+}{
+	{
+		// This is semantically identical to the test case given in issue 10.
+		words:     commonWords,
+		chunks:    []string{"<three>", "<five>"},
+		canSquash: true,
+		canTrunc:  false,
+	},
+	{
+		words:     commonWords,
+		chunks:    []string{"<one>", "<two>", "<three>"},
+		canSquash: true,
+		canTrunc:  false,
+	},
+	{
+		words:     commonWords,
+		chunks:    []string{"<two>", "<three>", "<four>", "<five>"},
+		canSquash: true,
+		canTrunc:  true,
+	},
+	{
+		words:     commonWords,
+		chunks:    []string{"<three>", "<four>"},
+		canSquash: true,
+		canTrunc:  true,
+	},
+	{
+		words:     commonWords,
+		chunks:    []string{"<seven>", "<eight>"},
+		canSquash: true,
+		canTrunc:  true,
+	},
+	{
+		words:     commonWords,
+		chunks:    []string{"<zero>", "<one>", "<two>", "<three>", "<four>", "<five>", "<six>", "<seven>", "<eight>"},
+		canSquash: true,
+		canTrunc:  true,
+	},
+	{
+		// This case would never happen with an htslib-like index, but
+		// it is a possible use case and not prohibited, so test it.
+		words:  commonWords,
+		chunks: []string{"<three>", "<zero>", "<five>", "<seven>", "<two>", "<eight>", "<five>"},
+
+		// Chunks are not in file order, so skip the merge and truncation variants.
+		canSquash: false,
+		canTrunc:  false,
+	},
+}
+
+var commonWords = []wordBlocks{ // the comments appear to give the bgzf.Chunk of each word on the line that follows
+	// Begin:{File:0 Block:0} End:{File:0 Block:6}
+	// Begin:{File:0 Block:6} End:{File:0 Block:11}
+	{word: "<zero>"}, {word: "<one>", flush: true},
+	// Begin:{File:43 Block:0} End:{File:43 Block:5}
+	// Begin:{File:43 Block:5} End:{File:43 Block:12}
+	// Begin:{File:43 Block:12} End:{File:43 Block:18}
+	{word: "<two>"}, {word: "<three>"}, {word: "<four>", flush: true},
+	// Begin:{File:93 Block:0} End:{File:93 Block:6}
+	// Begin:{File:93 Block:6} End:{File:93 Block:11}
+	{word: "<five>"}, {word: "<six>"}, {word: "<seven>", flush: true}, // NOTE(review): only two chunks are listed above for these three words
+	// Begin:{File:142 Block:0} End:{File:142 Block:7}
+	{word: "<eight>"},
+}
+
+type wordBlocks struct {
+	word  string // text written to the bgzf stream
+	flush bool   // if true, the writer is flushed after the word is written
+}
+
+type word int // NOTE(review): not used by the visible test; presumably satisfies an interface used elsewhere — confirm
+
+func (w word) RefID() int { return 0 }          // always 0
+func (w word) Start() int { return int(w) }     // the value of w
+func (w word) End() int   { return int(w + 1) } // one past Start
+
+func (s *S) TestIssue10(c *check.C) { // checks that a ChunkReader over the chosen chunks returns at least the wanted words
+	for _, test := range issue10Tests {
+		var buf bytes.Buffer
+
+		// Write the test words to an in-memory bgzf stream.
+		w := bgzf.NewWriter(&buf, *conc)
+		for _, wb := range test.words {
+			w.Write([]byte(wb.word))
+			if wb.flush {
+				w.Flush()
+			}
+		}
+		w.Close() // NOTE(review): Write, Flush and Close errors are not checked.
+
+		for _, strategy := range []MergeStrategy{nil, adjacent} { // nil means the chunks are used unmerged
+			if strategy != nil && !test.canSquash {
+				continue
+			}
+			for _, clean := range []bool{false, true} {
+				for _, truncFinal := range []bool{false, true} {
+					if truncFinal && !test.canTrunc {
+						continue
+					}
+					// Build an index mapping each word to the chunk it was read from.
+					r, err := bgzf.NewReader(bytes.NewReader(buf.Bytes()), *conc)
+					c.Assert(err, check.Equals, nil)
+					idx := make(map[string]bgzf.Chunk)
+					for i, wb := range test.words {
+						p := make([]byte, len(wb.word))
+						n, err := r.Read(p)
+						c.Assert(err, check.Equals, nil)
+						c.Assert(string(p[:n]), check.Equals, wb.word)
+
+						last := r.LastChunk()
+						if !clean {
+							// Simulate the index construction behaviour that appears
+							// to be what htslib does: a chunk that follows a flush
+							// begins at the previous word's chunk end. bgzf elides
+							// seeks that will not result in a productive read.
+							if i != 0 && test.words[i-1].flush {
+								last.Begin = idx[test.words[i-1].word].End
+							}
+						}
+						idx[wb.word] = last
+					}
+
+					var chunks []bgzf.Chunk
+					for _, w := range test.chunks {
+						chunks = append(chunks, idx[w])
+					}
+					var want string
+					if truncFinal { // drop the final chunk and end the one before it where the final chunk began
+						want = strings.Join(test.chunks[:len(test.chunks)-1], "")
+						chunks[len(chunks)-2].End = chunks[len(chunks)-1].Begin
+						chunks = chunks[:len(chunks)-1]
+					} else {
+						want = strings.Join(test.chunks, "")
+					}
+
+					if strategy != nil {
+						chunks = strategy(chunks)
+					}
+					cr, err := NewChunkReader(r, chunks)
+					c.Assert(err, check.Equals, nil)
+
+					var got bytes.Buffer
+					io.Copy(&got, cr) // NOTE(review): the copy error is ignored.
+					gotString := got.String()
+					c.Check(strings.Contains(gotString, want), check.Equals, true,
+						check.Commentf("clean=%t merge=%t trunc=%t chunks=%+v", clean, strategy != nil, truncFinal, chunks),
+					)
+					if gotString != want { // extra data around the wanted text is only logged, not failed
+						c.Logf("read over-run clean=%t merge=%t trunc=%t:\n\tgot: %q\n\twant:%q", clean, strategy != nil, truncFinal, gotString, want)
+					}
+				}
+			}
+		}
+	}
+}