diff --git a/httpblock.go b/httpblock.go index 29564ee..efe392e 100644 --- a/httpblock.go +++ b/httpblock.go @@ -22,7 +22,6 @@ import ( "errors" "fmt" "io" - "io/ioutil" "net/http" "github.com/nlnwa/gowarc/internal/diskbuffer" @@ -104,7 +103,7 @@ func (block *httpRequestBlock) BlockDigest() string { if block.filterReader == nil { block.filterReader = newDigestFilterReader(block.payload, block.blockDigest, block.payloadDigest) } - _, _ = io.Copy(ioutil.Discard, block.filterReader) + _, _ = io.Copy(io.Discard, block.filterReader) block.blockDigestString = block.blockDigest.format() block.payloadDigestString = block.payloadDigest.format() } @@ -239,7 +238,7 @@ func (block *httpResponseBlock) BlockDigest() string { if block.filterReader == nil { block.filterReader = newDigestFilterReader(block.payload, block.blockDigest, block.payloadDigest) } - _, _ = io.Copy(ioutil.Discard, block.filterReader) + _, _ = io.Copy(io.Discard, block.filterReader) block.blockDigestString = block.blockDigest.format() block.payloadDigestString = block.payloadDigest.format() } @@ -401,8 +400,9 @@ func newHttpBlock(opts *warcRecordOptions, wf *WarcFields, r io.Reader, blockDig // We have to fix the header for parsing even if we don't fix the record hb = append(hb, '\r', '\n') } - if err := resp.parseHeaders(hb); err != nil && opts.errSyntax > ErrIgnore { - if opts.errSyntax == ErrWarn { + if err := resp.parseHeaders(hb); err != nil && opts.errBlock > ErrIgnore { + err = fmt.Errorf("error in http response block: %w", err) + if opts.errBlock == ErrWarn { validation.addError(err) } else { return resp, err @@ -422,8 +422,9 @@ func newHttpBlock(opts *warcRecordOptions, wf *WarcFields, r io.Reader, blockDig // We have to fix the header for parsing even if we don't fix the record hb = append(hb, '\r', '\n') } - if err := resp.parseHeaders(hb); err != nil && opts.errSyntax > ErrIgnore { - if opts.errSyntax == ErrWarn { + if err := resp.parseHeaders(hb); err != nil && opts.errBlock > ErrIgnore { + err = fmt.Errorf("error in http request block: %w", err) + if opts.errBlock == ErrWarn { validation.addError(err) } else { return resp, err diff --git a/options.go b/options.go index 3f7ee89..3eee66e 100644 --- a/options.go +++ b/options.go @@ -26,6 +26,7 @@ type warcRecordOptions struct { errSyntax errorPolicy errSpec errorPolicy errUnknownRecordType errorPolicy + errBlock errorPolicy skipParseBlock bool addMissingRecordId bool recordIdFunc func() (string, error) @@ -82,6 +83,7 @@ func defaultWarcRecordOptions() warcRecordOptions { errSyntax: ErrWarn, errSpec: ErrWarn, errUnknownRecordType: ErrWarn, + errBlock: ErrIgnore, skipParseBlock: false, addMissingRecordId: true, recordIdFunc: defaultIdGenerator, @@ -141,6 +143,17 @@ func WithUnknownRecordTypePolicy(policy errorPolicy) WarcRecordOption { }) } +// WithBlockErrorPolicy sets the policy for handling errors in block parsing. +// +// For most records this is the content fetched from the original source and errors here should be ignored. +// +// defaults to ErrIgnore +func WithBlockErrorPolicy(policy errorPolicy) WarcRecordOption { + return newFuncWarcRecordOption(func(o *warcRecordOptions) { + o.errBlock = policy + }) +} + // WithAddMissingRecordId sets if missing WARC-Record-ID header should be generated. // // defaults to true @@ -239,7 +252,7 @@ func WithFixSyntaxErrors(fixSyntaxErrors bool) WarcRecordOption { // WithFixWarcFieldsBlockErrors sets if an attempt to fix syntax errors in warcfields block should be done when those are detected. // -// # This will not have any impact if SyntaxErrorPolicy is ErrIgnore +// A warcfields block is typically generated by a web crawler. An error in this context suggests a potential bug in the crawler's WARC writer. // // defaults to false func WithFixWarcFieldsBlockErrors(fixWarcFieldsBlockErrors bool) WarcRecordOption { diff --git a/unmarshaler_test.go b/unmarshaler_test.go index a500352..243a83b 100644 --- a/unmarshaler_test.go +++ b/unmarshaler_test.go @@ -584,6 +584,97 @@ func Test_unmarshaler_Unmarshal(t *testing.T) { WithAddMissingRecordId(false), WithFixContentLength(false), }, + "WARC/1.0\r\n" + + "WARC-Date: 2017-03-06T04:03:53Z\r\n" + + "WARC-Record-ID: \r\n" + + "WARC-Type: metadata\r\n" + + "Content-Type: application/warc-fields\r\n" + + "Content-Length: 18\r\n" + + "WARC-Block-Digest: sha1:QYG3QQJ4ULYPJGSJL34IS3U7VUAJFSKY\r\n" + + "\r\n" + + "foo: bar\n" + + "food:bar\n" + + "\r\n" + + "\r\n", + want{ + V1_0, + Metadata, + &WarcFields{ + &nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"}, + &nameValue{Name: WarcRecordID, Value: ""}, + &nameValue{Name: WarcType, Value: "metadata"}, + &nameValue{Name: ContentType, Value: "application/warc-fields"}, + &nameValue{Name: ContentLength, Value: "18"}, + &nameValue{Name: WarcBlockDigest, Value: "sha1:QYG3QQJ4ULYPJGSJL34IS3U7VUAJFSKY"}, + }, + &warcFieldsBlock{}, + "foo: bar\nfood:bar\n", + &Validation{}, + true, + }, + 0, + false, + }, + { + "metadata record missing carriage return in warc-fields block with fix syntax errors", + []WarcRecordOption{ + WithSpecViolationPolicy(ErrWarn), + WithSyntaxErrorPolicy(ErrWarn), + WithAddMissingDigest(true), + WithFixSyntaxErrors(true), + WithFixDigest(true), + WithAddMissingContentLength(false), + WithAddMissingRecordId(false), + WithFixContentLength(true), + WithFixWarcFieldsBlockErrors(true), + }, + "WARC/1.0\r\n" + + "WARC-Date: 2017-03-06T04:03:53Z\r\n" + + "WARC-Record-ID: \r\n" + + "WARC-Type: metadata\r\n" + + "Content-Type: application/warc-fields\r\n" + + "Content-Length: 18\r\n" + + "WARC-Block-Digest: sha1:QYG3QQJ4ULYPJGSJL34IS3U7VUAJFSKY\r\n" + + "\r\n" + + "foo: bar\n" + + "food:bar\n" + + "\r\n" + + "\r\n", + want{ + V1_0, + Metadata, + &WarcFields{ + &nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"}, + &nameValue{Name: WarcRecordID, Value: ""}, + &nameValue{Name: WarcType, Value: "metadata"}, + &nameValue{Name: ContentType, Value: "application/warc-fields"}, + &nameValue{Name: ContentLength, Value: "21"}, + &nameValue{Name: WarcBlockDigest, Value: "sha1:U2AN4MFP7IITXSOLYH2QTIPVDNJOHBFO"}, + }, + &warcFieldsBlock{}, + "Foo: bar\r\nFood: bar\r\n", + &Validation{ + fmt.Errorf("content length mismatch. header: 18, actual: 21"), + fmt.Errorf("block: %w", fmt.Errorf("wrong digest: expected sha1:QYG3QQJ4ULYPJGSJL34IS3U7VUAJFSKY, computed: sha1:U2AN4MFP7IITXSOLYH2QTIPVDNJOHBFO")), + }, + true, + }, + 0, + false, + }, + { + "metadata record missing carriage return in warc-fields block with BlockeErrorPolicy warn", + []WarcRecordOption{ + WithSpecViolationPolicy(ErrWarn), + WithSyntaxErrorPolicy(ErrWarn), + WithBlockErrorPolicy(ErrWarn), + WithAddMissingDigest(false), + WithFixSyntaxErrors(false), + WithFixDigest(false), + WithAddMissingContentLength(false), + WithAddMissingRecordId(false), + WithFixContentLength(false), + }, "WARC/1.0\r\n" + "WARC-Date: 2017-03-06T04:03:53Z\r\n" + "WARC-Record-ID: \r\n" + @@ -619,10 +710,11 @@ func Test_unmarshaler_Unmarshal(t *testing.T) { false, }, { - "metadata record missing carriage return in warc-fields block with fix syntax errors", + "metadata record missing carriage return in warc-fields block with fix syntax errors and BlockeErrorPolicy warn", []WarcRecordOption{ WithSpecViolationPolicy(ErrWarn), WithSyntaxErrorPolicy(ErrWarn), + WithBlockErrorPolicy(ErrWarn), WithAddMissingDigest(true), WithFixSyntaxErrors(true), WithFixDigest(true), diff --git a/warcfieldsblock.go b/warcfieldsblock.go index 6577306..d5daed2 100644 --- a/warcfieldsblock.go +++ b/warcfieldsblock.go @@ -80,7 +80,7 @@ func (block *warcFieldsBlock) Write(w io.Writer) (bytesWritten int64, err error) return } -func newWarcFieldsBlock(options *warcRecordOptions, wf *WarcFields, rb io.Reader, d *digest, validation *Validation) (WarcFieldsBlock, error) { +func newWarcFieldsBlock(options *warcRecordOptions, _ *WarcFields, rb io.Reader, d *digest, validation *Validation) (WarcFieldsBlock, error) { wfb := &warcFieldsBlock{blockDigest: d} var err error wfb.content, err = io.ReadAll(rb) @@ -95,11 +95,21 @@ func newWarcFieldsBlock(options *warcRecordOptions, wf *WarcFields, rb io.Reader p := &warcfieldsParser{options} blockValidation := Validation{} wfb.warcFields, err = p.Parse(bufio.NewReader(bytes.NewReader(wfb.content)), &blockValidation, &position{}) - for _, e := range blockValidation { - validation.addError(newWrappedSyntaxError("error in warc fields block", nil, e)) + if options.errBlock > ErrIgnore && !blockValidation.Valid() { + switch options.errBlock { + case ErrWarn: + for _, e := range blockValidation { + validation.addError(newWrappedSyntaxError("error in warc fields block", nil, e)) + } + case ErrFail: + if !blockValidation.Valid() { + err = newWrappedSyntaxError("error in warc fields block", nil, blockValidation[0]) + return wfb, err + } + } } - if !blockValidation.Valid() && options.fixWarcFieldsBlockErrors { + if options.fixWarcFieldsBlockErrors && !blockValidation.Valid() { // Write corrected warc fields block to content buffer b := bytes.Buffer{} _, err = wfb.WarcFields().Write(&b)