Skip to content

Commit

Permalink
set erdma resource quota based on EniQuantity
Browse files Browse the repository at this point in the history
to avoid ERI consuming too much of the normal multi-IP pod quota

Signed-off-by: bingshen.wbs <[email protected]>
  • Loading branch information
BSWANG committed Sep 13, 2024
1 parent 325d07e commit 7689000
Show file tree
Hide file tree
Showing 8 changed files with 153 additions and 10 deletions.
10 changes: 5 additions & 5 deletions daemon/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ func (b *NetworkServiceBuilder) setupENIManager() error {
return err
}
realRdmaCount := b.limit.ERDMARes()
if b.config.EnableERDMA && len(attached) >= b.limit.Adapters-1-b.limit.ERdmaAdapters {
if b.config.EnableERDMA && len(attached) >= b.limit.Adapters-1-b.limit.ERDMARes() {

Check warning on line 278 in daemon/builder.go

View check run for this annotation

Codecov / codecov/patch

daemon/builder.go#L278

Added line #L278 was not covered by tests
attachedERdma := lo.Filter(attached, func(ni *daemon.ENI, idx int) bool { return ni.ERdma })
if len(attachedERdma) <= 0 {
// turn off only when no one use it
Expand Down Expand Up @@ -359,8 +359,8 @@ func (b *NetworkServiceBuilder) setupENIManager() error {
}
normalENINeeded := poolConfig.MaxENI - normalENICount
if b.config.EnableERDMA {
normalENINeeded = poolConfig.MaxENI - b.limit.ERdmaAdapters - normalENICount
for i := 0; i < b.limit.ERdmaAdapters-erdmaENICount; i++ {
normalENINeeded = poolConfig.MaxENI - b.limit.ERDMARes() - normalENICount
for i := 0; i < b.limit.ERDMARes()-erdmaENICount; i++ {

Check warning on line 363 in daemon/builder.go

View check run for this annotation

Codecov / codecov/patch

daemon/builder.go#L362-L363

Added lines #L362 - L363 were not covered by tests
eniList = append(eniList, eni.NewLocal(nil, "erdma", factory, poolConfig))
}
}
Expand Down Expand Up @@ -389,8 +389,8 @@ func (b *NetworkServiceBuilder) setupENIManager() error {
}
normalENINeeded := poolConfig.MaxENI - normalENICount
if b.config.EnableERDMA {
normalENINeeded = poolConfig.MaxENI - b.limit.ERdmaAdapters - normalENICount
for i := 0; i < b.limit.ERdmaAdapters-erdmaENICount; i++ {
normalENINeeded = poolConfig.MaxENI - b.limit.ERDMARes() - normalENICount
for i := 0; i < b.limit.ERDMARes()-erdmaENICount; i++ {

Check warning on line 393 in daemon/builder.go

View check run for this annotation

Codecov / codecov/patch

daemon/builder.go#L392-L393

Added lines #L392 - L393 were not covered by tests
eniList = append(eniList, eni.NewLocal(nil, "erdma", factory, poolConfig))
}
}
Expand Down
4 changes: 2 additions & 2 deletions daemon/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ func getPoolConfig(cfg *daemon.Config, daemonMode string, limit *client.Limits)

poolConfig.MaxIPPerENI = 1
if cfg.EnableERDMA {
poolConfig.ERdmaCapacity = limit.ERdmaAdapters
poolConfig.ERdmaCapacity = limit.ERDMARes()

Check warning on line 126 in daemon/config.go

View check run for this annotation

Codecov / codecov/patch

daemon/config.go#L126

Added line #L126 was not covered by tests
}
case daemon.ModeENIMultiIP:
maxENI = limit.Adapters
Expand Down Expand Up @@ -161,7 +161,7 @@ func getPoolConfig(cfg *daemon.Config, daemonMode string, limit *client.Limits)
poolConfig.MaxIPPerENI = ipPerENI

if cfg.EnableERDMA {
poolConfig.ERdmaCapacity = limit.ERdmaAdapters * limit.IPv4PerAdapter
poolConfig.ERdmaCapacity = limit.ERDMARes() * limit.IPv4PerAdapter

Check warning on line 164 in daemon/config.go

View check run for this annotation

Codecov / codecov/patch

daemon/config.go#L164

Added line #L164 was not covered by tests
}
}

Expand Down
11 changes: 10 additions & 1 deletion pkg/aliyun/client/limit.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,16 @@ func (l *Limits) MultiIPPod() int {
}

// ERDMARes returns how many adapters are reserved for ERDMA (ERI) use.
//
// The value is derived from the adapter quota rather than taken directly
// from ERdmaAdapters, so that ERI does not consume too much of the quota
// that would otherwise serve normal multi-IP pods:
//
//   - 0 when ERdmaAdapters is not positive, or Adapters is 2 or fewer;
//   - at most 2 when Adapters is 8 or more (multi physical network card
//     instances);
//   - at most 1 otherwise.
func (l *Limits) ERDMARes() int {
	switch {
	case l.ERdmaAdapters <= 0 || l.Adapters <= 2:
		// No ERI capability reported, or too few adapters to spare one.
		return 0
	case l.Adapters >= 8:
		// For multi physical network card instances.
		return min(2, l.ERdmaAdapters)
	default:
		// Limit normal ECS ERI to 1, to avoid consuming too much of the
		// normal multi-IP pod quota.
		return min(1, l.ERdmaAdapters)
	}
}

func (l *Limits) ExclusiveENIPod() int {
Expand Down
86 changes: 86 additions & 0 deletions pkg/aliyun/client/limit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,92 @@ func TestGetInstanceType(t *testing.T) {
}
}

// TestGetERIRes checks the ERDMA quota computed by Limits.ERDMARes for
// instance types with different ENI and ERI quantities.
func TestGetERIRes(t *testing.T) {
	// fixture returns an instance type carrying the values shared by every
	// case; customize fills in the quantities that differ between cases.
	fixture := func(customize func(it *ecs.InstanceType)) *ecs.InstanceType {
		it := &ecs.InstanceType{
			EniPrivateIpAddressQuantity: 5,
			EniIpv6AddressQuantity:      10,
			InstanceBandwidthRx:         1000,
			InstanceBandwidthTx:         500,
			EniTrunkSupported:           true,
		}
		customize(it)
		return it
	}

	cases := []struct {
		desc    string
		spec    *ecs.InstanceType
		wantERI int
	}{
		{
			desc: "not support instance type",
			spec: fixture(func(it *ecs.InstanceType) {
				it.EniQuantity, it.EniTotalQuantity, it.EriQuantity = 2, 6, 0
			}),
			wantERI: 0,
		},
		{
			desc: "Small instance type",
			spec: fixture(func(it *ecs.InstanceType) {
				it.EniQuantity, it.EniTotalQuantity, it.EriQuantity = 2, 6, 2
			}),
			wantERI: 0,
		},
		{
			desc: "Basic instance type",
			spec: fixture(func(it *ecs.InstanceType) {
				it.EniQuantity, it.EniTotalQuantity, it.EriQuantity = 4, 6, 2
			}),
			wantERI: 1,
		},
		{
			desc: "giant instance type only one eri",
			spec: fixture(func(it *ecs.InstanceType) {
				it.EniQuantity, it.EniTotalQuantity, it.EriQuantity = 8, 10, 1
			}),
			wantERI: 1,
		},
		{
			desc: "giant instance type",
			spec: fixture(func(it *ecs.InstanceType) {
				it.EniQuantity, it.EniTotalQuantity, it.EriQuantity = 8, 10, 4
			}),
			wantERI: 2,
		},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {
			limits := getInstanceType(tc.spec)
			assert.Equal(t, tc.wantERI, limits.ERDMARes())
		})
	}
}

func TestECSLimitProvider_GetLimitFromAnno(t *testing.T) {

type args struct {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ func (r *ReconcileNode) createOrUpdate(ctx context.Context, k8sNode *corev1.Node
InstanceBandwidthTx: limit.InstanceBandwidthTx,
InstanceBandwidthRx: limit.InstanceBandwidthRx,
Adapters: limit.Adapters,
EriQuantity: limit.ERdmaAdapters,
EriQuantity: limit.ERDMARes(),
TotalAdapters: limit.TotalAdapters,
IPv6PerAdapter: limit.IPv6PerAdapter,
MemberAdapterLimit: limit.MemberAdapterLimit,
Expand Down
6 changes: 5 additions & 1 deletion plugin/datapath/policy_router_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,11 @@ func (d *PolicyRoute) Setup(cfg *types.SetupConfig, netNS ns.NetNS) error {
}

if cfg.ERDMA {
err = smc.ConfigSMCForDevice("erdma_0", cfg.ContainerIfName, netNS)
rdmaDev, err := utils.GetERdmaFromLink(eni)
if err != nil {
return fmt.Errorf("error get erdma device: %w", err)

Check warning on line 356 in plugin/datapath/policy_router_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/datapath/policy_router_linux.go#L354-L356

Added lines #L354 - L356 were not covered by tests
}
err = smc.ConfigSMCForDevice(rdmaDev.Attrs.Name, cfg.ContainerIfName, netNS)

Check warning on line 358 in plugin/datapath/policy_router_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/datapath/policy_router_linux.go#L358

Added line #L358 was not covered by tests
if err != nil {
return fmt.Errorf("error setup pnet config for pod: %w", err)
}
Expand Down
38 changes: 38 additions & 0 deletions plugin/driver/utils/utils_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"fmt"
"net"
"os"
"strconv"
"strings"

terwayIP "github.com/AliyunContainerService/terway/pkg/ip"
terwaySysctl "github.com/AliyunContainerService/terway/pkg/sysctl"
Expand Down Expand Up @@ -844,3 +846,39 @@ func CleanIPRules() (err error) {

return nil
}

func GetERdmaFromLink(link netlink.Link) (*netlink.RdmaLink, error) {
rdmaLinks, err := netlink.RdmaLinkList()
if err != nil {
return nil, fmt.Errorf("error list rdma links, %v", err)

Check warning on line 853 in plugin/driver/utils/utils_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/driver/utils/utils_linux.go#L850-L853

Added lines #L850 - L853 were not covered by tests
}
for _, rl := range rdmaLinks {
rdmaHwAddr, err := parseERdmaLinkHwAddr(rl.Attrs.NodeGuid)
if err != nil {
return nil, err

Check warning on line 858 in plugin/driver/utils/utils_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/driver/utils/utils_linux.go#L855-L858

Added lines #L855 - L858 were not covered by tests
}
linkHwAddr := link.Attrs().HardwareAddr

Check warning on line 860 in plugin/driver/utils/utils_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/driver/utils/utils_linux.go#L860

Added line #L860 was not covered by tests
// erdma guid first byte is ^= 0x2
linkHwAddr[0] ^= 0x2
if rdmaHwAddr.String() == linkHwAddr.String() {
return rl, nil

Check warning on line 864 in plugin/driver/utils/utils_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/driver/utils/utils_linux.go#L862-L864

Added lines #L862 - L864 were not covered by tests
}
}
return nil, fmt.Errorf("cannot found rdma link for %s", link.Attrs().Name)

Check warning on line 867 in plugin/driver/utils/utils_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/driver/utils/utils_linux.go#L867

Added line #L867 was not covered by tests
}

// parseERdmaLinkHwAddr converts an RDMA node GUID string into a 6-byte
// hardware address.
//
// guid must consist of exactly eight colon-separated hexadecimal bytes, e.g.
// "0d:d3:04:fe:ff:3e:16:02". The bytes are reversed, then the two middle
// bytes (presumably the EUI-64 "ff:fe" filler; their value is not checked)
// are dropped, so the example yields 02:16:3e:04:d3:0d.
//
// An error is returned when guid does not have eight fields or when a field
// is not a valid 8-bit hexadecimal number.
func parseERdmaLinkHwAddr(guid string) (net.HardwareAddr, error) {
	const guidLen = 8

	fields := strings.Split(guid, ":")
	if len(fields) != guidLen {
		return nil, fmt.Errorf("invalid rdma guid: %s", guid)
	}

	var reversed [guidLen]byte
	for i, field := range fields {
		b, err := strconv.ParseUint(field, 16, 8)
		if err != nil {
			return nil, fmt.Errorf("invalid rdma guid: %s, err: %w", guid, err)
		}
		// The textual GUID is in the opposite byte order to the MAC.
		reversed[guidLen-1-i] = byte(b)
	}

	// Build the result in a fresh slice instead of appending into an
	// overlapping sub-slice of the scratch buffer.
	return net.HardwareAddr{
		reversed[0], reversed[1], reversed[2],
		reversed[5], reversed[6], reversed[7],
	}, nil
}
6 changes: 6 additions & 0 deletions plugin/driver/utils/utils_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,3 +293,9 @@ var _ = Describe("Test TC filter", func() {
Expect(err).NotTo(HaveOccurred())
})
})

// TestParseERdmaLinkHwAddress verifies that a well-formed RDMA node GUID is
// converted into the expected 6-byte hardware address.
func TestParseERdmaLinkHwAddress(t *testing.T) {
	const nodeGUID = "0d:d3:04:fe:ff:3e:16:02"

	mac, parseErr := parseERdmaLinkHwAddr(nodeGUID)

	assert.NoError(t, parseErr)
	assert.Equal(t, "02:16:3e:04:d3:0d", mac.String())
}

0 comments on commit 7689000

Please sign in to comment.