Skip to content

Commit e60a399

Browse files
authored
Merge pull request #50 from nlnwa/eof_panic
Validate content-length and avoid panic on unexpected EOF
2 parents 16f1eea + 981bef7 commit e60a399

9 files changed

+122
-26
lines changed

block.go

+6
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ type Block interface {
3434
// RawBytes returns the bytes of the Block
3535
RawBytes() (io.Reader, error)
3636
BlockDigest() string
37+
Size() int64
3738
IsCached() bool
3839
Cache() error
3940
}
@@ -117,4 +118,9 @@ func (block *genericBlock) BlockDigest() string {
117118
return block.blockDigestString
118119
}
119120

121+
func (block *genericBlock) Size() int64 {
122+
block.BlockDigest()
123+
return block.blockDigest.count
124+
}
125+
120126
var errContentReAccessed = errors.New("gowarc.Block: tried to access content twice")

digest.go

+22-7
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,23 @@ import (
2929

3030
type digest struct {
3131
hash.Hash
32-
name string
33-
hash string
32+
name string
33+
hash string
34+
count int64
35+
}
36+
37+
// Write (via the embedded io.Writer interface) adds more data to the running hash.
38+
// It never returns an error.
39+
func (d *digest) Write(p []byte) (n int, err error) {
40+
d.count += int64(len(p))
41+
return d.Hash.Write(p)
42+
}
43+
44+
// Sum appends the current hash to b and returns the resulting slice.
45+
// It does not change the underlying hash state.
46+
func (d *digest) Sum(b []byte) []byte {
47+
d.count += int64(len(b))
48+
return d.Hash.Sum(b)
3449
}
3550

3651
func (d *digest) format() string {
@@ -55,15 +70,15 @@ func newDigest(digestString string) (*digest, error) {
5570
}
5671
switch algorithm {
5772
case "md5":
58-
return &digest{md5.New(), algorithm, hash}, nil
73+
return &digest{md5.New(), algorithm, hash, 0}, nil
5974
case "sha1":
60-
return &digest{sha1.New(), algorithm, hash}, nil
75+
return &digest{sha1.New(), algorithm, hash, 0}, nil
6176
case "sha256":
62-
return &digest{sha256.New(), algorithm, hash}, nil
77+
return &digest{sha256.New(), algorithm, hash, 0}, nil
6378
case "sha512":
64-
return &digest{sha512.New(), algorithm, hash}, nil
79+
return &digest{sha512.New(), algorithm, hash, 0}, nil
6580
case "":
66-
return &digest{sha1.New(), "sha1", hash}, nil
81+
return &digest{sha1.New(), "sha1", hash, 0}, nil
6782
default:
6883
return nil, fmt.Errorf("unsupported digest algorithm '%s'", algorithm)
6984
}

httpblock.go

+10
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ func (block *httpRequestBlock) BlockDigest() string {
104104
return block.blockDigestString
105105
}
106106

107+
func (block *httpRequestBlock) Size() int64 {
108+
block.BlockDigest()
109+
return block.blockDigest.count
110+
}
111+
107112
func (block *httpRequestBlock) PayloadBytes() (io.Reader, error) {
108113
if block.filterReader == nil {
109114
block.filterReader = newDigestFilterReader(block.payload, block.blockDigest, block.payloadDigest)
@@ -237,6 +242,11 @@ func (block *httpResponseBlock) BlockDigest() string {
237242
return block.blockDigestString
238243
}
239244

245+
func (block *httpResponseBlock) Size() int64 {
246+
block.BlockDigest()
247+
return block.blockDigest.count
248+
}
249+
240250
func (block *httpResponseBlock) PayloadBytes() (io.Reader, error) {
241251
if block.filterReader == nil {
242252
block.filterReader = newDigestFilterReader(block.payload, block.blockDigest, block.payloadDigest)

record.go

+38
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,25 @@ type WarcRecord interface {
4545
// RevisitRef extracts a RevisitRef current record if it is a revisit record.
4646
RevisitRef() (*RevisitRef, error)
4747
// CreateRevisitRef creates a RevisitRef which references the current record.
48+
//
4849
// The RevisitRef might be used by another records ToRevisitRecord to create a revisit record referencing this record.
4950
CreateRevisitRef(profile string) (*RevisitRef, error)
5051
// Merge merges this record with its referenced record(s)
52+
//
5153
// It is implemented only for revisit records, but this function will be enhanced to also support segmented records.
5254
Merge(record ...WarcRecord) (WarcRecord, error)
55+
// ValidateDigest validates block and payload digests if present.
56+
//
57+
// If option FixDigest is set, an invalid or missing digest will be corrected in the header.
58+
// Digest validation requires the whole content block to be read. As a side effect the Content-Length field is also validated
59+
// and if option FixContentLength is set, a wrong content length will be corrected in the header.
60+
//
61+
// If the record is not cached, it might not be possible to read any content from this record after validation.
62+
//
63+
// The result is dependent on the SpecViolationPolicy option:
64+
// ErrIgnore: only fatal errors are returned.
65+
// ErrWarn: all errors found will be added to the Validation.
66+
// ErrFail: the first error is returned and no more validation is done.
5367
ValidateDigest(validation *Validation) error
5468
}
5569

@@ -369,11 +383,35 @@ func (wr *warcRecord) parseBlock(reader io.Reader, validation *Validation) (err
369383
}
370384

371385
// ValidateDigest validates block and payload digests if present.
386+
//
372387
// If option FixDigest is set, an invalid or missing digest will be corrected in the header.
388+
// Digest validation requires the whole content block to be read. As a side effect the Content-Length field is also validated
389+
// and if option FixContentLength is set, a wrong content length will be corrected in the header.
390+
//
373391
// If the record is not cached, it might not be possible to read any content from this record after validation.
392+
//
393+
// The result is dependent on the SpecViolationPolicy option:
394+
// ErrIgnore: only fatal errors are returned.
395+
// ErrWarn: all errors found will be added to the Validation.
396+
// ErrFail: the first error is returned and no more validation is done.
374397
func (wr *warcRecord) ValidateDigest(validation *Validation) error {
375398
wr.Block().BlockDigest()
376399

400+
size := strconv.FormatInt(wr.block.Size(), 10)
401+
if wr.opts.errSpec > ErrIgnore {
402+
if wr.WarcHeader().Has(ContentLength) && size != wr.headers.Get(ContentLength) {
403+
switch wr.opts.errSpec {
404+
case ErrWarn:
405+
validation.addError(fmt.Errorf("content length mismatch. header: %v, actual: %v", wr.headers.Get(ContentLength), size))
406+
if wr.opts.fixContentLength {
407+
wr.WarcHeader().Set(ContentLength, size)
408+
}
409+
case ErrFail:
410+
return fmt.Errorf("content length mismatch. header: %v, actual: %v", wr.headers.Get(ContentLength), size)
411+
}
412+
}
413+
}
414+
377415
var blockDigest, payloadDigest *digest
378416
switch v := wr.Block().(type) {
379417
case *genericBlock:

recordbuilder.go

-14
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
package gowarc
1818

1919
import (
20-
"fmt"
2120
"github.com/nlnwa/gowarc/internal/diskbuffer"
2221
"io"
2322
"strconv"
@@ -151,19 +150,6 @@ func (rb *recordBuilder) validate(wr *warcRecord) (*Validation, error) {
151150
return validation, err
152151
}
153152

154-
if rb.opts.errSpec > ErrIgnore {
155-
if wr.WarcHeader().Has(ContentLength) && size != wr.headers.Get(ContentLength) {
156-
switch rb.opts.errSpec {
157-
case ErrWarn:
158-
validation.addError(fmt.Errorf("content length mismatch. header: %v, actual: %v", wr.headers.Get(ContentLength), size))
159-
if rb.opts.fixContentLength {
160-
wr.WarcHeader().Set(ContentLength, size)
161-
}
162-
case ErrFail:
163-
return validation, fmt.Errorf("content length mismatch. header: %v, actual: %v", wr.headers.Get(ContentLength), size)
164-
}
165-
}
166-
}
167153
return validation, err
168154
}
169155

revisitblock.go

+4
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ func (block *revisitBlock) PayloadDigest() string {
5454
return block.payloadDigestString
5555
}
5656

57+
func (block *revisitBlock) Size() int64 {
58+
return int64(len(block.headerBytes))
59+
}
60+
5761
func (block *revisitBlock) Write(w io.Writer) (int64, error) {
5862
p, err := block.RawBytes()
5963
if err != nil {

unmarshaler.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -183,10 +183,11 @@ func (u *unmarshaler) Unmarshal(b *bufio.Reader) (WarcRecord, int64, *Validation
183183
_, err = u.gz.Read(b)
184184
}
185185
if err != io.EOF {
186-
panic(err)
186+
_ = u.gz.Close()
187+
return err
187188
}
188189
if err := u.gz.Close(); err != nil {
189-
panic(err)
190+
return err
190191
}
191192
}
192193
return err

unmarshaler_test.go

+35-3
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ import (
2323
"github.com/stretchr/testify/assert"
2424
"github.com/stretchr/testify/require"
2525
"io/ioutil"
26-
"strconv"
2726
"strings"
2827
"testing"
2928
)
@@ -529,6 +528,41 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
529528
0,
530529
true,
531530
},
531+
{
532+
"short response record",
533+
[]WarcRecordOption{WithSpecViolationPolicy(ErrFail), WithSyntaxErrorPolicy(ErrFail), WithUnknownRecordTypePolicy(ErrIgnore)},
534+
"WARC/1.0\r\n" +
535+
"WARC-Date: 2017-03-06T04:03:53Z\r\n" +
536+
"WARC-Record-ID: <urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>\r\n" +
537+
"WARC-Type: response\r\n" +
538+
"Content-Type: application/http;msgtype=response\r\n" +
539+
"Content-Length: 257\r\n" +
540+
"\r\n" +
541+
"HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" +
542+
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" +
543+
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the " +
544+
"\r\n\r\n",
545+
want{
546+
V1_0,
547+
Response,
548+
&WarcFields{
549+
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
550+
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
551+
&nameValue{Name: WarcType, Value: "response"},
552+
&nameValue{Name: ContentType, Value: "application/http;msgtype=response"},
553+
&nameValue{Name: ContentLength, Value: "257"},
554+
},
555+
&httpResponseBlock{},
556+
"HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" +
557+
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" +
558+
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the " +
559+
"\r\n\r\n",
560+
&Validation{},
561+
false,
562+
},
563+
0,
564+
true,
565+
},
532566
}
533567
for _, tt := range tests {
534568
t.Run(tt.name, func(t *testing.T) {
@@ -555,8 +589,6 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
555589
content, err := ioutil.ReadAll(r)
556590
assert.Nil(err)
557591

558-
contentLength, _ := strconv.Atoi(gotRecord.WarcHeader().Get(ContentLength))
559-
assert.Equal(contentLength, len(content), "ContentLength")
560592
assert.Equal(tt.want.content, string(content), "Content")
561593
assert.Equal(tt.wantOffset, gotOffset, "Offset")
562594

warcfieldsblock.go

+4
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ func (b *warcFieldsBlock) BlockDigest() string {
6161
return b.blockDigest.format()
6262
}
6363

64+
func (block *warcFieldsBlock) Size() int64 {
65+
return int64(len(block.content))
66+
}
67+
6468
func (b *warcFieldsBlock) Write(w io.Writer) (bytesWritten int64, err error) {
6569
bytesWritten, err = b.warcFields.Write(w)
6670
if err != nil {

0 commit comments

Comments
 (0)