Skip to content

Commit ab8bfc2

Browse files
authored
Merge pull request #18842 from ghouscht/defrag-fixes-backport-3.5
[3.5] fix(defrag): handle errors during defrag
2 parents 1b88fc4 + f26ff91 commit ab8bfc2

File tree

2 files changed

+90
-4
lines changed

2 files changed

+90
-4
lines changed

server/mvcc/backend/backend.go

+16-4
Original file line numberDiff line numberDiff line change
@@ -477,22 +477,21 @@ func (b *backend) defrag() error {
477477
b.readTx.Lock()
478478
defer b.readTx.Unlock()
479479

480-
b.batchTx.unsafeCommit(true)
481-
482-
b.batchTx.tx = nil
483-
484480
// Create a temporary file to ensure we start with a clean slate.
485481
// Snapshotter.cleanupSnapdir cleans up any of these that are found during startup.
486482
dir := filepath.Dir(b.db.Path())
487483
temp, err := ioutil.TempFile(dir, "db.tmp.*")
488484
if err != nil {
489485
return err
490486
}
487+
491488
options := bolt.Options{}
492489
if boltOpenOptions != nil {
493490
options = *boltOpenOptions
494491
}
495492
options.OpenFile = func(_ string, _ int, _ os.FileMode) (file *os.File, err error) {
493+
// gofail: var defragOpenFileError string
494+
// return nil, fmt.Errorf(defragOpenFileError)
496495
return temp, nil
497496
}
498497
// Don't load tmp db into memory regardless of opening options
@@ -515,13 +514,23 @@ func (b *backend) defrag() error {
515514
zap.String("current-db-size-in-use", humanize.Bytes(uint64(sizeInUse1))),
516515
)
517516
}
517+
518+
// Commit/stop and then reset current transactions (including the readTx)
519+
b.batchTx.unsafeCommit(true)
520+
b.batchTx.tx = nil
521+
518522
// gofail: var defragBeforeCopy struct{}
519523
err = defragdb(b.db, tmpdb, defragLimit)
520524
if err != nil {
521525
tmpdb.Close()
522526
if rmErr := os.RemoveAll(tmpdb.Path()); rmErr != nil {
523527
b.lg.Error("failed to remove db.tmp after defragmentation completed", zap.Error(rmErr))
524528
}
529+
530+
// restore the bbolt transactions if defragmentation fails
531+
b.batchTx.tx = b.unsafeBegin(true)
532+
b.readTx.tx = b.unsafeBegin(false)
533+
525534
return err
526535
}
527536

@@ -574,6 +583,9 @@ func (b *backend) defrag() error {
574583
}
575584

576585
func defragdb(odb, tmpdb *bolt.DB, limit int) error {
586+
// gofail: var defragdbFail string
587+
// return fmt.Errorf(defragdbFail)
588+
577589
// open a tx on tmpdb for writes
578590
tmptx, err := tmpdb.Begin(true)
579591
if err != nil {

tests/e2e/defrag_no_space_test.go

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
// Copyright 2024 The etcd Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package e2e
16+
17+
import (
18+
"context"
19+
"fmt"
20+
"testing"
21+
"time"
22+
23+
"github.com/stretchr/testify/require"
24+
25+
"go.etcd.io/etcd/tests/v3/framework/e2e"
26+
)
27+
28+
// TestDefragNoSpace checks that a failed defragmentation is reported to the
// client and that the member keeps serving requests afterwards.
//
// Each table entry names a gofail failpoint (failpoint) and the error text it
// is configured to return (err). For every entry the test starts a fresh
// single-member cluster with gofail enabled, arms the failpoint, runs a
// defragment that is expected to fail with that text, and then performs a
// put/get round trip against the same member.
func TestDefragNoSpace(t *testing.T) {
29+
tests := []struct {
30+
name string
31+
failpoint string
32+
err string
33+
}{
34+
{
35+
name: "no space (#18810) - can't open/create new bbolt db",
36+
failpoint: "defragOpenFileError",
37+
err: "no space",
38+
},
39+
{
40+
name: "defragdb failure",
41+
failpoint: "defragdbFail",
42+
err: "some random error",
43+
},
44+
}
45+
46+
for _, tc := range tests {
47+
t.Run(tc.name, func(t *testing.T) {
48+
e2e.BeforeTest(t)
49+
50+
clus, err := e2e.NewEtcdProcessCluster(t,
51+
&e2e.EtcdProcessClusterConfig{
52+
ClusterSize: 1,
53+
LogLevel: "debug",
54+
GoFailEnabled: true,
55+
},
56+
)
57+
require.NoError(t, err)
58+
t.Cleanup(func() { clus.Stop() })
59+
60+
member := clus.Procs[0]
61+
etcdctl := member.Etcdctl(e2e.ClientNonTLS, false, false)
62+
63+
// Arm the failpoint with a `return("<err>")` term, then expect the
// defragment call to fail with an error containing that same text.
require.NoError(t, member.Failpoints().SetupHTTP(context.Background(), tc.failpoint, fmt.Sprintf(`return("%s")`, tc.err)))
64+
require.ErrorContains(t, etcdctl.Defragment(time.Minute), tc.err)
65+
66+
// Make sure etcd continues to run even after the failed defrag attempt
67+
require.NoError(t, etcdctl.Put("foo", "bar"))
68+
value, err := etcdctl.Get("foo")
69+
require.NoError(t, err)
70+
require.Len(t, value.Kvs, 1)
71+
require.Equal(t, "bar", string(value.Kvs[0].Value))
72+
})
73+
}
74+
}

0 commit comments

Comments
 (0)