diff --git a/go/vt/topo/tablet.go b/go/vt/topo/tablet.go index 72ccfdb7135..fb542839c9f 100644 --- a/go/vt/topo/tablet.go +++ b/go/vt/topo/tablet.go @@ -338,15 +338,10 @@ func (ts *Server) CreateTablet(ctx context.Context, tablet *topodatapb.Tablet) e return err } tabletPath := path.Join(TabletsPath, topoproto.TabletAliasString(tablet.Alias), TabletFile) - if _, err = conn.Create(ctx, tabletPath, data); err != nil && !IsErrType(err, NodeExists) { + if _, err = conn.Create(ctx, tabletPath, data); err != nil { return err } - // Update ShardReplication in any case, to be sure. This is - // meant to fix the case when a Tablet record was created, but - // then the ShardReplication record was not (because for - // instance of a startup timeout). Upon running this code - // again, we want to fix ShardReplication. if updateErr := UpdateTabletReplicationData(ctx, ts, tablet); updateErr != nil { return updateErr } diff --git a/go/vt/topo/topotests/tablet_test.go b/go/vt/topo/topotests/tablet_test.go index 75d9fe4dc17..cc307a0badb 100644 --- a/go/vt/topo/topotests/tablet_test.go +++ b/go/vt/topo/topotests/tablet_test.go @@ -22,7 +22,6 @@ import ( "github.com/golang/protobuf/proto" "golang.org/x/net/context" - "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/memorytopo" topodatapb "vitess.io/vitess/go/vt/proto/topodata" @@ -61,26 +60,4 @@ func TestCreateTablet(t *testing.T) { if err != nil || len(sri.Nodes) != 1 || !proto.Equal(sri.Nodes[0].TabletAlias, alias) { t.Fatalf("Created ShardReplication doesn't match: %v %v", sri, err) } - - // Create the same tablet again, make sure it fails with ErrNodeExists. - if err := ts.CreateTablet(ctx, tablet); !topo.IsErrType(err, topo.NodeExists) { - t.Fatalf("CreateTablet(again) returned: %v", err) - } - - // Remove the ShardReplication record, try to create the - // tablets again, make sure it's fixed. - if err := topo.RemoveShardReplicationRecord(ctx, ts, cell, keyspace, shard, alias); err != nil { - t.Fatalf("RemoveShardReplicationRecord failed: %v", err) - } - sri, err = ts.GetShardReplication(ctx, cell, keyspace, shard) - if err != nil || len(sri.Nodes) != 0 { - t.Fatalf("Modifed ShardReplication doesn't match: %v %v", sri, err) - } - if err := ts.CreateTablet(ctx, tablet); !topo.IsErrType(err, topo.NodeExists) { - t.Fatalf("CreateTablet(again and again) returned: %v", err) - } - sri, err = ts.GetShardReplication(ctx, cell, keyspace, shard) - if err != nil || len(sri.Nodes) != 1 || !proto.Equal(sri.Nodes[0].TabletAlias, alias) { - t.Fatalf("Created ShardReplication doesn't match: %v %v", sri, err) - } } diff --git a/go/vt/vttablet/tabletmanager/init_tablet.go b/go/vt/vttablet/tabletmanager/init_tablet.go index 54da38dac89..3d71cf76913 100644 --- a/go/vt/vttablet/tabletmanager/init_tablet.go +++ b/go/vt/vttablet/tabletmanager/init_tablet.go @@ -187,6 +187,15 @@ func (agent *ActionAgent) InitTablet(port, gRPCPort int32) error { return fmt.Errorf("InitTablet failed because existing tablet keyspace and shard %v/%v differ from the provided ones %v/%v", oldTablet.Keyspace, oldTablet.Shard, tablet.Keyspace, tablet.Shard) } + // Update ShardReplication in any case, to be sure. This is + // meant to fix the case when a Tablet record was created, but + // then the ShardReplication record was not (because for + // instance of a startup timeout). Upon running this code + // again, we want to fix ShardReplication. + if updateErr := topo.UpdateTabletReplicationData(ctx, agent.TopoServer, tablet); updateErr != nil { + return updateErr + } + // Then overwrite everything, ignoring version mismatch. if err := agent.TopoServer.UpdateTablet(ctx, topo.NewTabletInfo(tablet, nil)); err != nil { return fmt.Errorf("UpdateTablet failed: %v", err) diff --git a/go/vt/vttablet/tabletmanager/init_tablet_test.go b/go/vt/vttablet/tabletmanager/init_tablet_test.go index 0cbac8915ee..1fe63b9f358 100644 --- a/go/vt/vttablet/tabletmanager/init_tablet_test.go +++ b/go/vt/vttablet/tabletmanager/init_tablet_test.go @@ -22,6 +22,8 @@ import ( "golang.org/x/net/context" + "github.com/golang/protobuf/proto" + "vitess.io/vitess/go/history" "vitess.io/vitess/go/vt/dbconfigs" "vitess.io/vitess/go/vt/mysqlctl/fakemysqldaemon" @@ -31,6 +33,133 @@ import ( topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) +// Init tablet fixes replication data when safe +func TestInitTabletFixesReplicationData(t *testing.T) { + ctx := context.Background() + cell := "cell1" + ts := memorytopo.NewServer(cell, "cell2") + tabletAlias := &topodatapb.TabletAlias{ + Cell: cell, + Uid: 1, + } + + // start with a tablet record that doesn't exist + port := int32(1234) + gRPCPort := int32(3456) + mysqlDaemon := fakemysqldaemon.NewFakeMysqlDaemon(nil) + agent := &ActionAgent{ + TopoServer: ts, + TabletAlias: tabletAlias, + MysqlDaemon: mysqlDaemon, + DBConfigs: &dbconfigs.DBConfigs{}, + BinlogPlayerMap: nil, + batchCtx: ctx, + History: history.New(historyLength), + _healthy: fmt.Errorf("healthcheck not run yet"), + } + + // 1. Initialize the tablet as REPLICA. + *tabletHostname = "localhost" + *initKeyspace = "test_keyspace" + *initShard = "-C0" + *initTabletType = "replica" + tabletAlias = &topodatapb.TabletAlias{ + Cell: cell, + Uid: 2, + } + agent.TabletAlias = tabletAlias + if err := agent.InitTablet(port, gRPCPort); err != nil { + t.Fatalf("InitTablet(type) failed: %v", err) + } + sri, err := ts.GetShardReplication(ctx, cell, *initKeyspace, "-c0") + if err != nil || len(sri.Nodes) != 1 || !proto.Equal(sri.Nodes[0].TabletAlias, tabletAlias) { + t.Fatalf("Created ShardReplication doesn't match: %v %v", sri, err) + } + + // Remove the ShardReplication record, try to create the + // tablets again, make sure it's fixed. + if err := topo.RemoveShardReplicationRecord(ctx, ts, cell, *initKeyspace, "-c0", tabletAlias); err != nil { + t.Fatalf("RemoveShardReplicationRecord failed: %v", err) + } + sri, err = ts.GetShardReplication(ctx, cell, *initKeyspace, "-c0") + if err != nil || len(sri.Nodes) != 0 { + t.Fatalf("Modifed ShardReplication doesn't match: %v %v", sri, err) + } + + // Initialize the same tablet again, CreateTablet will fail, but it should recreate shard replication data + if err := agent.InitTablet(port, gRPCPort); err != nil { + t.Fatalf("InitTablet(type) failed: %v", err) + } + + sri, err = ts.GetShardReplication(ctx, cell, *initKeyspace, "-c0") + if err != nil || len(sri.Nodes) != 1 || !proto.Equal(sri.Nodes[0].TabletAlias, tabletAlias) { + t.Fatalf("Created ShardReplication doesn't match: %v %v", sri, err) + } +} + +// This is a test to make sure a regression does not happen in the future. +// There is code in InitTablet that updates replication data if tablet fails +// to be created due to a NodeExists error. During this particular error we were not doing +// the sanity checks that the provided tablet was the same in the topo. +func TestInitTabletDoesNotUpdateReplicationDataForTabletInWrongShard(t *testing.T) { + ctx := context.Background() + ts := memorytopo.NewServer("cell1", "cell2") + tabletAlias := &topodatapb.TabletAlias{ + Cell: "cell1", + Uid: 1, + } + + // start with a tablet record that doesn't exist + port := int32(1234) + gRPCPort := int32(3456) + mysqlDaemon := fakemysqldaemon.NewFakeMysqlDaemon(nil) + agent := &ActionAgent{ + TopoServer: ts, + TabletAlias: tabletAlias, + MysqlDaemon: mysqlDaemon, + DBConfigs: &dbconfigs.DBConfigs{}, + BinlogPlayerMap: nil, + batchCtx: ctx, + History: history.New(historyLength), + _healthy: fmt.Errorf("healthcheck not run yet"), + } + + // 1. Initialize the tablet as REPLICA. + *tabletHostname = "localhost" + *initKeyspace = "test_keyspace" + *initShard = "-C0" + *initTabletType = "replica" + tabletAlias = &topodatapb.TabletAlias{ + Cell: "cell1", + Uid: 2, + } + agent.TabletAlias = tabletAlias + if err := agent.InitTablet(port, gRPCPort); err != nil { + t.Fatalf("InitTablet(type) failed: %v", err) + } + tabletAliases, err := ts.FindAllTabletAliasesInShard(ctx, "test_keyspace", "-c0") + if err != nil { + t.Fatalf("Could not fetch tablet aliases for shard: %v", err) + } + + if len(tabletAliases) != 1 { + t.Fatalf("Expected to have only one tablet alias, got: %v", len(tabletAliases)) + } + if tabletAliases[0].Uid != 2 { + t.Fatalf("Expected table UID be equal to 2, got: %v", tabletAliases[0].Uid) + } + + // Try to initialize a tablet with the same uid in a different shard. + *initShard = "-D0" + if err := agent.InitTablet(port, gRPCPort); err == nil { + t.Fatalf("InitTablet(type) should have failed, got nil") + } + + if _, err = ts.FindAllTabletAliasesInShard(ctx, "test_keyspace", "-d0"); err == nil { + t.Fatalf("Tablet shouldn't be added to replication data") + } +} + // TestInitTablet will test the InitTablet code creates / updates the // tablet node correctly. Note we modify global parameters (the flags) // so this has to be in one test.