Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[1.9] Set ERDMA resource quota based on EniQuantity to avoid ERIs consuming too much of the normal multi-IP pod quota #693

Merged
merged 1 commit into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions daemon/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@
return err
}
realRdmaCount := b.limit.ERDMARes()
if b.config.EnableERDMA && len(attached) >= b.limit.Adapters-1-b.limit.ERdmaAdapters {
if b.config.EnableERDMA && len(attached) >= b.limit.Adapters-1-b.limit.ERDMARes() {

Check warning on line 278 in daemon/builder.go

View check run for this annotation

Codecov / codecov/patch

daemon/builder.go#L278

Added line #L278 was not covered by tests
attachedERdma := lo.Filter(attached, func(ni *daemon.ENI, idx int) bool { return ni.ERdma })
if len(attachedERdma) <= 0 {
// turn off only when no one use it
Expand Down Expand Up @@ -359,8 +359,8 @@
}
normalENINeeded := poolConfig.MaxENI - normalENICount
if b.config.EnableERDMA {
normalENINeeded = poolConfig.MaxENI - b.limit.ERdmaAdapters - normalENICount
for i := 0; i < b.limit.ERdmaAdapters-erdmaENICount; i++ {
normalENINeeded = poolConfig.MaxENI - b.limit.ERDMARes() - normalENICount
for i := 0; i < b.limit.ERDMARes()-erdmaENICount; i++ {

Check warning on line 363 in daemon/builder.go

View check run for this annotation

Codecov / codecov/patch

daemon/builder.go#L362-L363

Added lines #L362 - L363 were not covered by tests
eniList = append(eniList, eni.NewLocal(nil, "erdma", factory, poolConfig))
}
}
Expand Down Expand Up @@ -389,8 +389,8 @@
}
normalENINeeded := poolConfig.MaxENI - normalENICount
if b.config.EnableERDMA {
normalENINeeded = poolConfig.MaxENI - b.limit.ERdmaAdapters - normalENICount
for i := 0; i < b.limit.ERdmaAdapters-erdmaENICount; i++ {
normalENINeeded = poolConfig.MaxENI - b.limit.ERDMARes() - normalENICount
for i := 0; i < b.limit.ERDMARes()-erdmaENICount; i++ {

Check warning on line 393 in daemon/builder.go

View check run for this annotation

Codecov / codecov/patch

daemon/builder.go#L392-L393

Added lines #L392 - L393 were not covered by tests
eniList = append(eniList, eni.NewLocal(nil, "erdma", factory, poolConfig))
}
}
Expand Down
4 changes: 2 additions & 2 deletions daemon/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@

poolConfig.MaxIPPerENI = 1
if cfg.EnableERDMA {
poolConfig.ERdmaCapacity = limit.ERdmaAdapters
poolConfig.ERdmaCapacity = limit.ERDMARes()

Check warning on line 126 in daemon/config.go

View check run for this annotation

Codecov / codecov/patch

daemon/config.go#L126

Added line #L126 was not covered by tests
}
case daemon.ModeENIMultiIP:
maxENI = limit.Adapters
Expand Down Expand Up @@ -161,7 +161,7 @@
poolConfig.MaxIPPerENI = ipPerENI

if cfg.EnableERDMA {
poolConfig.ERdmaCapacity = limit.ERdmaAdapters * limit.IPv4PerAdapter
poolConfig.ERdmaCapacity = limit.ERDMARes() * limit.IPv4PerAdapter

Check warning on line 164 in daemon/config.go

View check run for this annotation

Codecov / codecov/patch

daemon/config.go#L164

Added line #L164 was not covered by tests
}
}

Expand Down
11 changes: 10 additions & 1 deletion pkg/aliyun/client/limit.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,16 @@ func (l *Limits) MultiIPPod() int {
}

// ERDMARes returns how many adapters should be reserved for ERDMA (ERI) use.
//
// The value is derived from the instance limits rather than taken verbatim
// from ERdmaAdapters, so that ERIs do not consume too much of the quota
// available to normal multi-IP pods:
//   - 0 when the instance reports no ERDMA adapters or has at most 2 adapters;
//   - at most 2 when the instance has 8 or more adapters;
//   - at most 1 otherwise.
func (l *Limits) ERDMARes() int {
	if l.ERdmaAdapters <= 0 || l.Adapters <= 2 {
		return 0
	}
	// Instances with many adapters (multiple physical network cards) may
	// dedicate up to two of them to ERDMA.
	if l.Adapters >= 8 {
		return min(2, l.ERdmaAdapters)
	}
	// Ordinary ECS instances are limited to a single ERI so the remaining
	// adapters stay available for normal multi-IP pods.
	return min(1, l.ERdmaAdapters)
}

func (l *Limits) ExclusiveENIPod() int {
Expand Down
86 changes: 86 additions & 0 deletions pkg/aliyun/client/limit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,92 @@ func TestGetInstanceType(t *testing.T) {
}
}

// TestGetERIRes checks the ERDMA quota reported by ERDMARes for instance
// types with different ENI and ERI quantities.
func TestGetERIRes(t *testing.T) {
	type testCase struct {
		name         string
		instanceType *ecs.InstanceType
		wantERI      int
	}

	cases := []testCase{
		{
			// No ERI on the instance type at all.
			name: "not support instance type",
			instanceType: &ecs.InstanceType{
				EniQuantity:                 2,
				EniPrivateIpAddressQuantity: 5,
				EniIpv6AddressQuantity:      10,
				EniTotalQuantity:            6,
				EriQuantity:                 0,
				InstanceBandwidthRx:         1000,
				InstanceBandwidthTx:         500,
				EniTrunkSupported:           true,
			},
			wantERI: 0,
		},
		{
			// ERIs are available, but there are too few ENIs to spare one.
			name: "Small instance type",
			instanceType: &ecs.InstanceType{
				EniQuantity:                 2,
				EniPrivateIpAddressQuantity: 5,
				EniIpv6AddressQuantity:      10,
				EniTotalQuantity:            6,
				EriQuantity:                 2,
				InstanceBandwidthRx:         1000,
				InstanceBandwidthTx:         500,
				EniTrunkSupported:           true,
			},
			wantERI: 0,
		},
		{
			// Fewer than 8 ENIs: capped at a single ERI.
			name: "Basic instance type",
			instanceType: &ecs.InstanceType{
				EniQuantity:                 4,
				EniPrivateIpAddressQuantity: 5,
				EniIpv6AddressQuantity:      10,
				EniTotalQuantity:            6,
				EriQuantity:                 2,
				InstanceBandwidthRx:         1000,
				InstanceBandwidthTx:         500,
				EniTrunkSupported:           true,
			},
			wantERI: 1,
		},
		{
			// 8 ENIs but only one ERI offered by the instance type.
			name: "giant instance type only one eri",
			instanceType: &ecs.InstanceType{
				EniQuantity:                 8,
				EniPrivateIpAddressQuantity: 5,
				EniIpv6AddressQuantity:      10,
				EniTotalQuantity:            10,
				EriQuantity:                 1,
				InstanceBandwidthRx:         1000,
				InstanceBandwidthTx:         500,
				EniTrunkSupported:           true,
			},
			wantERI: 1,
		},
		{
			// 8 ENIs and plenty of ERIs: capped at two.
			name: "giant instance type",
			instanceType: &ecs.InstanceType{
				EniQuantity:                 8,
				EniPrivateIpAddressQuantity: 5,
				EniIpv6AddressQuantity:      10,
				EniTotalQuantity:            10,
				EriQuantity:                 4,
				InstanceBandwidthRx:         1000,
				InstanceBandwidthTx:         500,
				EniTrunkSupported:           true,
			},
			wantERI: 2,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			limits := getInstanceType(tc.instanceType)
			assert.Equal(t, tc.wantERI, limits.ERDMARes())
		})
	}
}

func TestECSLimitProvider_GetLimitFromAnno(t *testing.T) {

type args struct {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ func (r *ReconcileNode) createOrUpdate(ctx context.Context, k8sNode *corev1.Node
InstanceBandwidthTx: limit.InstanceBandwidthTx,
InstanceBandwidthRx: limit.InstanceBandwidthRx,
Adapters: limit.Adapters,
EriQuantity: limit.ERdmaAdapters,
EriQuantity: limit.ERDMARes(),
TotalAdapters: limit.TotalAdapters,
IPv6PerAdapter: limit.IPv6PerAdapter,
MemberAdapterLimit: limit.MemberAdapterLimit,
Expand Down
6 changes: 5 additions & 1 deletion plugin/datapath/policy_router_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,11 @@
}

if cfg.ERDMA {
err = smc.ConfigSMCForDevice("erdma_0", cfg.ContainerIfName, netNS)
rdmaDev, err := utils.GetERdmaFromLink(eni)
if err != nil {
return fmt.Errorf("error get erdma device: %w", err)

Check warning on line 356 in plugin/datapath/policy_router_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/datapath/policy_router_linux.go#L354-L356

Added lines #L354 - L356 were not covered by tests
}
err = smc.ConfigSMCForDevice(rdmaDev.Attrs.Name, cfg.ContainerIfName, netNS)

Check warning on line 358 in plugin/datapath/policy_router_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/datapath/policy_router_linux.go#L358

Added line #L358 was not covered by tests
if err != nil {
return fmt.Errorf("error setup pnet config for pod: %w", err)
}
Expand Down
38 changes: 38 additions & 0 deletions plugin/driver/utils/utils_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
"fmt"
"net"
"os"
"strconv"
"strings"

terwayIP "github.com/AliyunContainerService/terway/pkg/ip"
terwaySysctl "github.com/AliyunContainerService/terway/pkg/sysctl"
Expand Down Expand Up @@ -844,3 +846,39 @@

return nil
}

func GetERdmaFromLink(link netlink.Link) (*netlink.RdmaLink, error) {
rdmaLinks, err := netlink.RdmaLinkList()
if err != nil {
return nil, fmt.Errorf("error list rdma links, %v", err)

Check warning on line 853 in plugin/driver/utils/utils_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/driver/utils/utils_linux.go#L850-L853

Added lines #L850 - L853 were not covered by tests
}
for _, rl := range rdmaLinks {
rdmaHwAddr, err := parseERdmaLinkHwAddr(rl.Attrs.NodeGuid)
if err != nil {
return nil, err

Check warning on line 858 in plugin/driver/utils/utils_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/driver/utils/utils_linux.go#L855-L858

Added lines #L855 - L858 were not covered by tests
}
linkHwAddr := link.Attrs().HardwareAddr

Check warning on line 860 in plugin/driver/utils/utils_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/driver/utils/utils_linux.go#L860

Added line #L860 was not covered by tests
// erdma guid first byte is ^= 0x2
linkHwAddr[0] ^= 0x2
if rdmaHwAddr.String() == linkHwAddr.String() {
return rl, nil

Check warning on line 864 in plugin/driver/utils/utils_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/driver/utils/utils_linux.go#L862-L864

Added lines #L862 - L864 were not covered by tests
}
}
return nil, fmt.Errorf("cannot found rdma link for %s", link.Attrs().Name)

Check warning on line 867 in plugin/driver/utils/utils_linux.go

View check run for this annotation

Codecov / codecov/patch

plugin/driver/utils/utils_linux.go#L867

Added line #L867 was not covered by tests
}

// parseERdmaLinkHwAddr converts an RDMA node GUID string into a 6-byte
// hardware address.
//
// guid must consist of exactly eight colon-separated hexadecimal bytes. The
// bytes are taken in reverse order of their appearance in the string, and the
// two middle bytes of the reversed sequence are dropped, e.g.
// "0d:d3:04:fe:ff:3e:16:02" yields 02:16:3e:04:d3:0d.
//
// An error is returned when guid does not have eight fields or when a field is
// not a valid 8-bit hexadecimal number; in the latter case the strconv error
// is wrapped and can be inspected with errors.As / errors.Unwrap.
func parseERdmaLinkHwAddr(guid string) (net.HardwareAddr, error) {
	fields := strings.Split(guid, ":")
	if len(fields) != 8 {
		return nil, fmt.Errorf("invalid rdma guid: %s", guid)
	}

	var reversed [8]byte
	for i, field := range fields {
		b, err := strconv.ParseUint(field, 16, 8)
		if err != nil {
			return nil, fmt.Errorf("invalid rdma guid: %s, err: %w", guid, err)
		}
		// The GUID string lists the bytes back to front.
		reversed[len(reversed)-1-i] = byte(b)
	}

	// Keep the first and last three bytes; reversed[3] and reversed[4] are
	// not part of the hardware address.
	return net.HardwareAddr{
		reversed[0], reversed[1], reversed[2],
		reversed[5], reversed[6], reversed[7],
	}, nil
}
6 changes: 6 additions & 0 deletions plugin/driver/utils/utils_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,3 +293,9 @@ var _ = Describe("Test TC filter", func() {
Expect(err).NotTo(HaveOccurred())
})
})

// TestParseERdmaLinkHwAddress verifies that a well-formed RDMA node GUID is
// converted to the expected 6-byte hardware address.
func TestParseERdmaLinkHwAddress(t *testing.T) {
	const (
		guid    = "0d:d3:04:fe:ff:3e:16:02"
		wantMAC = "02:16:3e:04:d3:0d"
	)

	got, err := parseERdmaLinkHwAddr(guid)

	assert.NoError(t, err)
	assert.Equal(t, wantMAC, got.String())
}