Skip to content

Commit

Permalink
Added helper methods for subnet failover + unit tests
Browse files Browse the repository at this point in the history
Added method to perform subnet failover

Added tests for subnet failover
  • Loading branch information
juanfont authored and kradalby committed Dec 6, 2022
1 parent b62acff commit 6718ff7
Show file tree
Hide file tree
Showing 3 changed files with 281 additions and 6 deletions.
17 changes: 13 additions & 4 deletions machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,17 @@ func (machine Machine) isExpired() bool {
return time.Now().UTC().After(*machine.Expiry)
}

// isOnline returns if the machine is connected to Headscale.
// This is really a naive implementation, as we don't really see
// if there is a working connection between the client and the server.
func (machine *Machine) isOnline() bool {
if machine.LastSeen == nil {
return false
}

return machine.LastSeen.After(time.Now().Add(-keepAliveInterval))
}

func containsAddresses(inputs []string, addrs []string) bool {
for _, addr := range addrs {
if contains(inputs, addr) {
Expand Down Expand Up @@ -708,9 +719,7 @@ func (h *Headscale) toNode(

hostInfo := machine.GetHostInfo()

// A node is Online if it is connected to the control server,
// and we now we update LastSeen every keepAliveInterval duration at least.
online := machine.LastSeen.After(time.Now().Add(-keepAliveInterval))
online := machine.isOnline()

node := tailcfg.Node{
ID: tailcfg.NodeID(machine.ID), // this is the actual ID
Expand Down Expand Up @@ -1027,7 +1036,7 @@ func (h *Headscale) EnableRoutes(machine *Machine, routeStrs ...string) error {

// Mark already as primary if there is only this node offering this subnet
// (and is not an exit route)
if prefix != ExitRouteV4 && prefix != ExitRouteV6 {
if !route.isExitRoute() {
route.IsPrimary = h.isUniquePrefix(route)
}

Expand Down
123 changes: 123 additions & 0 deletions routes.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"net/netip"

"github.com/rs/zerolog/log"
"gorm.io/gorm"
)

Expand Down Expand Up @@ -34,6 +35,10 @@ func (r *Route) String() string {
return fmt.Sprintf("%s:%s", r.Machine, netip.Prefix(r.Prefix).String())
}

func (r *Route) isExitRoute() bool {
return netip.Prefix(r.Prefix) == ExitRouteV4 || netip.Prefix(r.Prefix) == ExitRouteV6
}

func (rs Routes) toPrefixes() []netip.Prefix {
prefixes := make([]netip.Prefix, len(rs))
for i, r := range rs {
Expand All @@ -54,6 +59,23 @@ func (h *Headscale) isUniquePrefix(route Route) bool {
return count == 0
}

func (h *Headscale) getPrimaryRoute(prefix netip.Prefix) (*Route, error) {
var route Route
err := h.db.
Preload("Machine").
Where("prefix = ? AND advertised = ? AND enabled = ? AND is_primary = ?", IPPrefix(prefix), true, true, true).
First(&route).Error
if err != nil && err != gorm.ErrRecordNotFound {
return nil, err
}

if err == gorm.ErrRecordNotFound {
return nil, gorm.ErrRecordNotFound
}

return &route, nil
}

// getMachinePrimaryRoutes returns the routes that are enabled and marked as primary (for subnet failover)
// Exit nodes are not considered for this, as they are never marked as Primary
func (h *Headscale) getMachinePrimaryRoutes(m *Machine) ([]Route, error) {
Expand Down Expand Up @@ -120,3 +142,104 @@ func (h *Headscale) processMachineRoutes(machine *Machine) error {

return nil
}

func (h *Headscale) handlePrimarySubnetFailover() error {
// first, get all the enabled routes
var routes []Route
err := h.db.
Preload("Machine").
Where("advertised = ? AND enabled = ?", true, true).
Find(&routes).Error
if err != nil && err != gorm.ErrRecordNotFound {
log.Error().Err(err).Msg("error getting routes")
}

for _, route := range routes {
if route.isExitRoute() {
continue
}

if !route.IsPrimary {
_, err := h.getPrimaryRoute(netip.Prefix(route.Prefix))
if h.isUniquePrefix(route) || err == gorm.ErrRecordNotFound {
route.IsPrimary = true
err := h.db.Save(&route).Error
if err != nil {
log.Error().Err(err).Msg("error marking route as primary")

return err
}
continue
}
}

if route.IsPrimary {
if route.Machine.isOnline() {
continue
}

// machine offline, find a new primary
log.Info().
Str("machine", route.Machine.Hostname).
Str("prefix", netip.Prefix(route.Prefix).String()).
Msgf("machine offline, finding a new primary subnet")

// find a new primary route
var newPrimaryRoutes []Route
err := h.db.
Preload("Machine").
Where("prefix = ? AND machine_id != ? AND advertised = ? AND enabled = ?",
route.Prefix,
route.MachineID,
true, true).
Find(&newPrimaryRoutes).Error
if err != nil && err != gorm.ErrRecordNotFound {
log.Error().Err(err).Msg("error finding new primary route")

return err
}

var newPrimaryRoute *Route
for _, r := range newPrimaryRoutes {
if r.Machine.isOnline() {
newPrimaryRoute = &r
break
}
}

if newPrimaryRoute == nil {
log.Warn().
Str("machine", route.Machine.Hostname).
Str("prefix", netip.Prefix(route.Prefix).String()).
Msgf("no alternative primary route found")
continue
}

log.Info().
Str("old_machine", route.Machine.Hostname).
Str("prefix", netip.Prefix(route.Prefix).String()).
Str("new_machine", newPrimaryRoute.Machine.Hostname).
Msgf("found new primary route")

// disable the old primary route
route.IsPrimary = false
err = h.db.Save(&route).Error
if err != nil {
log.Error().Err(err).Msg("error disabling old primary route")

return err
}

// enable the new primary route
newPrimaryRoute.IsPrimary = true
err = h.db.Save(&newPrimaryRoute).Error
if err != nil {
log.Error().Err(err).Msg("error enabling new primary route")

return err
}
}
}

return nil
}
147 changes: 145 additions & 2 deletions routes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package headscale

import (
"net/netip"
"time"

"gopkg.in/check.v1"
"tailscale.com/tailcfg"
Expand Down Expand Up @@ -150,7 +151,7 @@ func (s *Suite) TestIsUniquePrefix(c *check.C) {
RoutableIPs: []netip.Prefix{route, route2},
}
machine1 := Machine{
ID: 0,
ID: 1,
MachineKey: "foo",
NodeKey: "bar",
DiscoKey: "faa",
Expand All @@ -175,7 +176,7 @@ func (s *Suite) TestIsUniquePrefix(c *check.C) {
RoutableIPs: []netip.Prefix{route2},
}
machine2 := Machine{
ID: 0,
ID: 2,
MachineKey: "foo",
NodeKey: "bar",
DiscoKey: "faa",
Expand Down Expand Up @@ -209,3 +210,145 @@ func (s *Suite) TestIsUniquePrefix(c *check.C) {
c.Assert(err, check.IsNil)
c.Assert(len(routes), check.Equals, 0)
}

func (s *Suite) TestSubnetFailover(c *check.C) {
namespace, err := app.CreateNamespace("test")
c.Assert(err, check.IsNil)

pak, err := app.CreatePreAuthKey(namespace.Name, false, false, nil, nil)
c.Assert(err, check.IsNil)

_, err = app.GetMachine("test", "test_enable_route_machine")
c.Assert(err, check.NotNil)

prefix, err := netip.ParsePrefix(
"10.0.0.0/24",
)
c.Assert(err, check.IsNil)

prefix2, err := netip.ParsePrefix(
"150.0.10.0/25",
)
c.Assert(err, check.IsNil)

hostInfo1 := tailcfg.Hostinfo{
RoutableIPs: []netip.Prefix{prefix, prefix2},
}

now := time.Now()
machine1 := Machine{
ID: 1,
MachineKey: "foo",
NodeKey: "bar",
DiscoKey: "faa",
Hostname: "test_enable_route_machine",
NamespaceID: namespace.ID,
RegisterMethod: RegisterMethodAuthKey,
AuthKeyID: uint(pak.ID),
HostInfo: HostInfo(hostInfo1),
LastSeen: &now,
}
app.db.Save(&machine1)

err = app.processMachineRoutes(&machine1)
c.Assert(err, check.IsNil)

err = app.EnableRoutes(&machine1, prefix.String())
c.Assert(err, check.IsNil)

err = app.EnableRoutes(&machine1, prefix2.String())
c.Assert(err, check.IsNil)

err = app.handlePrimarySubnetFailover()
c.Assert(err, check.IsNil)

enabledRoutes1, err := app.GetEnabledRoutes(&machine1)
c.Assert(err, check.IsNil)
c.Assert(len(enabledRoutes1), check.Equals, 2)

route, err := app.getPrimaryRoute(prefix)
c.Assert(err, check.IsNil)
c.Assert(route.MachineID, check.Equals, machine1.ID)

hostInfo2 := tailcfg.Hostinfo{
RoutableIPs: []netip.Prefix{prefix2},
}
machine2 := Machine{
ID: 2,
MachineKey: "foo",
NodeKey: "bar",
DiscoKey: "faa",
Hostname: "test_enable_route_machine",
NamespaceID: namespace.ID,
RegisterMethod: RegisterMethodAuthKey,
AuthKeyID: uint(pak.ID),
HostInfo: HostInfo(hostInfo2),
LastSeen: &now,
}
app.db.Save(&machine2)

err = app.processMachineRoutes(&machine2)
c.Assert(err, check.IsNil)

err = app.EnableRoutes(&machine2, prefix2.String())
c.Assert(err, check.IsNil)

err = app.handlePrimarySubnetFailover()
c.Assert(err, check.IsNil)

enabledRoutes1, err = app.GetEnabledRoutes(&machine1)
c.Assert(err, check.IsNil)
c.Assert(len(enabledRoutes1), check.Equals, 2)

enabledRoutes2, err := app.GetEnabledRoutes(&machine2)
c.Assert(err, check.IsNil)
c.Assert(len(enabledRoutes2), check.Equals, 1)

routes, err := app.getMachinePrimaryRoutes(&machine1)
c.Assert(err, check.IsNil)
c.Assert(len(routes), check.Equals, 2)

routes, err = app.getMachinePrimaryRoutes(&machine2)
c.Assert(err, check.IsNil)
c.Assert(len(routes), check.Equals, 0)

// lets make machine1 lastseen 10 mins ago
before := now.Add(-10 * time.Minute)
machine1.LastSeen = &before
err = app.db.Save(&machine1).Error
c.Assert(err, check.IsNil)

err = app.handlePrimarySubnetFailover()
c.Assert(err, check.IsNil)

routes, err = app.getMachinePrimaryRoutes(&machine1)
c.Assert(err, check.IsNil)
c.Assert(len(routes), check.Equals, 1)

routes, err = app.getMachinePrimaryRoutes(&machine2)
c.Assert(err, check.IsNil)
c.Assert(len(routes), check.Equals, 1)

machine2.HostInfo = HostInfo(tailcfg.Hostinfo{
RoutableIPs: []netip.Prefix{prefix, prefix2},
})
err = app.db.Save(&machine2).Error
c.Assert(err, check.IsNil)

err = app.processMachineRoutes(&machine2)
c.Assert(err, check.IsNil)

err = app.EnableRoutes(&machine2, prefix.String())
c.Assert(err, check.IsNil)

err = app.handlePrimarySubnetFailover()
c.Assert(err, check.IsNil)

routes, err = app.getMachinePrimaryRoutes(&machine1)
c.Assert(err, check.IsNil)
c.Assert(len(routes), check.Equals, 0)

routes, err = app.getMachinePrimaryRoutes(&machine2)
c.Assert(err, check.IsNil)
c.Assert(len(routes), check.Equals, 2)
}

0 comments on commit 6718ff7

Please sign in to comment.