diff --git a/CHANGELOG.md b/CHANGELOG.md index 09c016f1..9ce6153b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +unreleased +---------- + +- Treat io.ErrUnexpectedEOF as driver.ErrBadConn so database/sql discards the + connection. Since v1.12.0 this could result in permanently broken connections, + especially with CockroachDB, which frequently sends partial messages ([#1299]). + +[#1299]: https://github.com/lib/pq/pull/1299 + v1.12.1 (2026-03-30) -------------------- diff --git a/conn_test.go b/conn_test.go index e40db804..b75f31b4 100644 --- a/conn_test.go +++ b/conn_test.go @@ -16,6 +16,7 @@ import ( "runtime" "strconv" "strings" + "sync/atomic" "testing" "time" @@ -507,7 +508,7 @@ func TestErrorDuringStartupClosesConn(t *testing.T) { func TestBadConn(t *testing.T) { t.Parallel() - for _, tt := range []error{io.EOF, &Error{Severity: pqerror.SeverityFatal}} { + for _, tt := range []error{io.EOF, &Error{Severity: pqerror.SeverityFatal}, io.ErrUnexpectedEOF} { t.Run(fmt.Sprintf("%s", tt), func(t *testing.T) { var cn conn err := cn.handleError(tt) @@ -521,6 +522,63 @@ func TestBadConn(t *testing.T) { } } +func TestUnexpectedEOF(t *testing.T) { + t.Parallel() + + // On the first "select truncate" the fake server sends a correct RowDescription followed + // by a truncated DataRow (header declares 96 body bytes, only 5 are sent) + // and then closes the connection. database/sql should discard the connection + // and retry, and subsequent queries should succeed. 
+ var failed atomic.Bool + f := pqtest.NewFake(t, func(f pqtest.Fake, cn net.Conn) { + f.Startup(cn, nil) + for { + code, q, ok := f.ReadMsg(cn) + if !ok { + return + } + switch code { + case proto.Terminate: + cn.Close() + return + case proto.Query: + switch q := string(q[:bytes.IndexByte(q, 0)]); { + case q == ";": // Ping() + f.WriteMsg(cn, proto.EmptyQueryResponse, "") + f.WriteMsg(cn, proto.ReadyForQuery, "I") + case q == "select truncate" && !failed.Swap(true): + f.WriteMsg(cn, proto.RowDescription, "\x00\x01truncate\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x19\xff\xff\xff\xff\xff\xff\x00\x00") + cn.Write([]byte("D\x00\x00\x00\x64short")) + cn.Close() + return + case q == "select truncate": + f.SimpleQuery(cn, "SELECT", "truncate", "1") + f.WriteMsg(cn, proto.ReadyForQuery, "I") + case q == "select okay": + f.SimpleQuery(cn, "SELECT", "okay", "1") + f.WriteMsg(cn, proto.ReadyForQuery, "I") + default: + panic(fmt.Sprintf("unexpected query: %q", q)) + } + } + } + }) + defer f.Close() + + db := pqtest.MustDB(t, f.DSN()) + db.SetMaxOpenConns(1) + db.SetMaxIdleConns(1) + + // This should work as database/sql retries for us. + pqtest.QueryRow[int](t, db, `select truncate`) + if !failed.Load() { + t.Fatal("select truncate never failed") + } + + // Make sure it doesn't break the connection. + pqtest.QueryRow[int](t, db, `select okay`) +} + func TestConnClose(t *testing.T) { // Ensure the underlying connection can be closed with Close after an error. t.Run("CloseBadConn", func(t *testing.T) { diff --git a/error.go b/error.go index 0851a66b..7d061875 100644 --- a/error.go +++ b/error.go @@ -307,7 +307,7 @@ func (cn *conn) handleError(reported error, query ...string) error { reported = driver.ErrBadConn } case error: - if err == io.EOF || err.Error() == "remote error: handshake failure" { + if err == io.EOF || err == io.ErrUnexpectedEOF || err.Error() == "remote error: handshake failure" { reported = driver.ErrBadConn } default: