-
Notifications
You must be signed in to change notification settings - Fork 15
A new stat for bytes read off the disk #117
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -49,6 +49,7 @@ type docValueReader struct { | |
| curChunkHeader []MetaData | ||
| curChunkData []byte // compressed data cache | ||
| uncompressed []byte // temp buf for snappy decompression | ||
| bytesRead uint64 | ||
| } | ||
|
|
||
| func (di *docValueReader) size() int { | ||
|
|
@@ -96,6 +97,10 @@ func (s *SegmentBase) loadFieldDocValueReader(field string, | |
| chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8]) | ||
| // acquire position of chunk offsets | ||
| chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen | ||
|
|
||
| // 16 bytes since it corresponds to the length | ||
| // of chunk offsets and the position of the offsets | ||
| s.bytesRead += uint64(16) | ||
| } else { | ||
| return nil, fmt.Errorf("loadFieldDocValueReader: fieldDvLoc too small: %d-%d", fieldDvLocEnd, fieldDvLocStart) | ||
| } | ||
|
|
@@ -116,13 +121,28 @@ func (s *SegmentBase) loadFieldDocValueReader(field string, | |
| fdvIter.chunkOffsets[i] = loc | ||
| offset += uint64(read) | ||
| } | ||
|
|
||
| s.bytesRead += offset | ||
| // set the data offset | ||
| fdvIter.dvDataLoc = fieldDvLocStart | ||
|
|
||
| return fdvIter, nil | ||
| } | ||
|
|
||
| // Implements the segment.DiskStatsReporter interface | ||
| // The purpose of this implementation is to get | ||
| // the bytes read from the disk (pertaining to the | ||
| // docvalues) while querying. | ||
| // the loadDvChunk retrieves the next chunk of docvalues | ||
| // and the bytes retrieved off the disk pertaining to that | ||
| // is accounted as well. | ||
| func (di *docValueReader) BytesRead() uint64 { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Commentary indicating that these methods are an interface implementation, along with a reference to that interface here, would be useful.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. done. |
||
| return di.bytesRead | ||
| } | ||
|
|
||
| func (di *docValueReader) SetBytesRead(val uint64) { | ||
| di.bytesRead = val | ||
| } | ||
|
|
||
| func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error { | ||
| // advance to the chunk where the docValues | ||
| // reside for the given docNum | ||
|
|
@@ -145,7 +165,7 @@ func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error | |
| return fmt.Errorf("failed to read the chunk") | ||
| } | ||
| chunkMetaLoc := destChunkDataLoc + uint64(read) | ||
|
|
||
| di.bytesRead += uint64(read) | ||
| offset := uint64(0) | ||
| if cap(di.curChunkHeader) < int(numDocs) { | ||
| di.curChunkHeader = make([]MetaData, int(numDocs)) | ||
|
|
@@ -161,6 +181,7 @@ func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error | |
|
|
||
| compressedDataLoc := chunkMetaLoc + offset | ||
| dataLength := curChunkEnd - compressedDataLoc | ||
| di.bytesRead += uint64(dataLength + offset) | ||
| di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength] | ||
| di.curChunkNum = chunkNumber | ||
| di.uncompressed = di.uncompressed[:0] | ||
|
|
@@ -295,6 +316,7 @@ func (s *SegmentBase) VisitDocValues(localDocNum uint64, fields []string, | |
| if err != nil { | ||
| return dvs, err | ||
| } | ||
| s.bytesRead += dvr.BytesRead() | ||
| } | ||
|
|
||
| _ = dvr.visitDocValues(localDocNum, visitor) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -108,6 +108,7 @@ type PostingsList struct { | |
| normBits1Hit uint64 | ||
|
|
||
| chunkSize uint64 | ||
| bytesRead uint64 | ||
| } | ||
|
|
||
| // represents an immutable, empty postings list | ||
|
|
@@ -208,11 +209,13 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, | |
| // initialize freq chunk reader | ||
| if rv.includeFreqNorm { | ||
| rv.freqNormReader = newChunkedIntDecoder(p.sb.mem, p.freqOffset, rv.freqNormReader) | ||
| rv.bytesRead += rv.freqNormReader.bytesRead() | ||
| } | ||
|
|
||
| // initialize the loc chunk reader | ||
| if rv.includeLocs { | ||
| rv.locReader = newChunkedIntDecoder(p.sb.mem, p.locOffset, rv.locReader) | ||
| rv.bytesRead += rv.locReader.bytesRead() | ||
| } | ||
|
|
||
| rv.all = p.postings.Iterator() | ||
|
|
@@ -244,6 +247,18 @@ func (p *PostingsList) Count() uint64 { | |
| return n - e | ||
| } | ||
|
|
||
| // Implements the segment.DiskStatsReporter interface | ||
| // The purpose of this implementation is to get | ||
| // the bytes read from the postings lists stored | ||
| // on disk, while querying | ||
| func (p *PostingsList) SetBytesRead(val uint64) { | ||
| p.bytesRead = val | ||
| } | ||
|
|
||
| func (p *PostingsList) BytesRead() uint64 { | ||
| return p.bytesRead | ||
| } | ||
|
|
||
| func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { | ||
| rv.postingsOffset = postingsOffset | ||
|
|
||
|
|
@@ -268,6 +283,8 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { | |
|
|
||
| roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen] | ||
|
|
||
| rv.bytesRead += (n + postingsLen) | ||
|
|
||
| if rv.postings == nil { | ||
| rv.postings = roaring.NewBitmap() | ||
| } | ||
|
|
@@ -316,6 +333,8 @@ type PostingsIterator struct { | |
|
|
||
| includeFreqNorm bool | ||
| includeLocs bool | ||
|
|
||
| bytesRead uint64 | ||
| } | ||
|
|
||
| var emptyPostingsIterator = &PostingsIterator{} | ||
|
|
@@ -331,19 +350,40 @@ func (i *PostingsIterator) Size() int { | |
| return sizeInBytes | ||
| } | ||
|
|
||
| // Implements the segment.DiskStatsReporter interface | ||
| // The purpose of this implementation is to get | ||
| // the bytes read from the disk which includes | ||
| // the freqNorm and location specific information | ||
| // of a hit | ||
| func (i *PostingsIterator) SetBytesRead(val uint64) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Commentary for the interface implementations.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. done. |
||
| i.bytesRead = val | ||
| } | ||
|
|
||
| func (i *PostingsIterator) BytesRead() uint64 { | ||
| return i.bytesRead | ||
| } | ||
|
|
||
| func (i *PostingsIterator) loadChunk(chunk int) error { | ||
| if i.includeFreqNorm { | ||
| err := i.freqNormReader.loadChunk(chunk) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| // assign the bytes read at this point, since | ||
| // the postingsIterator is tracking only the chunk loaded | ||
| // and the cumulation is tracked correctly in the downstream | ||
| // intDecoder | ||
| i.bytesRead = i.freqNormReader.bytesRead() | ||
|
|
||
| } | ||
|
|
||
| if i.includeLocs { | ||
| err := i.locReader.loadChunk(chunk) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| i.bytesRead = i.locReader.bytesRead() | ||
| } | ||
|
|
||
| i.currChunk = uint32(chunk) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -101,6 +101,7 @@ type SegmentBase struct { | |
| fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field | ||
| fieldDvNames []string // field names cached in fieldDvReaders | ||
| size uint64 | ||
| bytesRead uint64 | ||
|
|
||
| m sync.Mutex | ||
| fieldFSTs map[uint16]*vellum.FST | ||
|
|
@@ -210,9 +211,26 @@ func (s *Segment) loadConfig() error { | |
|
|
||
| numDocsOffset := storedIndexOffset - 8 | ||
| s.numDocs = binary.BigEndian.Uint64(s.mm[numDocsOffset : numDocsOffset+8]) | ||
|
|
||
| // 8*4 + 4*3 = 44 bytes being accounted from all the offsets | ||
| // above being read from the file | ||
| s.bytesRead += 44 | ||
| return nil | ||
| } | ||
|
|
||
| // Implements the segment.DiskStatsReporter interface | ||
| // Only the persistedSegment type implements the | ||
| // interface, as the intention is to retrieve the bytes | ||
| // read from the on-disk segment as part of the current | ||
| // query. | ||
| func (s *Segment) SetBytesRead(val uint64) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Now that I'm looking at this — it feels a bit distasteful to allow another library to just overwrite this parameter. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I agree with you @abhinavdangeti — why do we need this API? This would be an inherent task within the merge process. Wouldn't that satisfy the requirement? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, it may not be possible without making disk format changes. |
||
| s.SegmentBase.bytesRead = val | ||
| } | ||
|
|
||
| func (s *Segment) BytesRead() uint64 { | ||
| return s.bytesRead + s.SegmentBase.bytesRead | ||
| } | ||
|
|
||
| func (s *SegmentBase) loadFields() error { | ||
| // NOTE for now we assume the fields index immediately precedes | ||
| // the footer, and if this changes, need to adjust accordingly (or | ||
|
|
@@ -224,6 +242,9 @@ func (s *SegmentBase) loadFields() error { | |
| for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd { | ||
| addr := binary.BigEndian.Uint64(s.mem[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8]) | ||
|
|
||
| // accounting the address of the dictLoc being read from file | ||
| s.bytesRead += 8 | ||
|
|
||
| dictLoc, read := binary.Uvarint(s.mem[addr:fieldsIndexEnd]) | ||
| n := uint64(read) | ||
| s.dictLocs = append(s.dictLocs, dictLoc) | ||
|
|
@@ -233,6 +254,7 @@ func (s *SegmentBase) loadFields() error { | |
| n += uint64(read) | ||
|
|
||
| name := string(s.mem[addr+n : addr+n+nameLen]) | ||
| s.bytesRead += (n + nameLen) | ||
| s.fieldsInv = append(s.fieldsInv, name) | ||
| s.fieldsMap[name] = uint16(fieldID + 1) | ||
|
|
||
|
|
@@ -267,6 +289,7 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { | |
| // read the length of the vellum data | ||
| vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64]) | ||
| fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen] | ||
| sb.bytesRead += (uint64(read) + vellumLen) | ||
| rv.fst, err = vellum.Load(fstBytes) | ||
| if err != nil { | ||
| sb.m.Unlock() | ||
|
|
@@ -556,6 +579,7 @@ func (s *SegmentBase) loadDvReaders() error { | |
| } | ||
| read += uint64(n) | ||
|
|
||
| s.bytesRead += read | ||
| fieldDvReader, err := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd) | ||
| if err != nil { | ||
| return err | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some commentary here on why `16` here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.