Enable NodePool controller to apply generated MachineConfigs #1729
Changes from all commits: 4b089ad, f98df3d, 3943a4e, bff593a
@@ -1,6 +1,7 @@

# Manage node-level tuning with the Node Tuning Operator

If you would like to set some node-level tuning on the nodes in your hosted cluster, you can use the [Node Tuning Operator](https://docs.openshift.com/container-platform/latest/scalability_and_performance/using-node-tuning-operator.html). In HyperShift, node tuning can be configured by creating ConfigMaps which contain Tuned objects, and referencing these ConfigMaps in your NodePools.

## Creating a simple TuneD profile for setting sysctl settings

1. Create a ConfigMap which contains a valid Tuned manifest and reference it in a NodePool. The example Tuned manifest below defines a profile which sets `vm.dirty_ratio` to 55 on Nodes that have the Node label `tuned-1-node-label` with any value.
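The Tuned manifest itself is unchanged context that is not shown in this diff. As a rough sketch only (the names `tuned-1`, `tuned-1-profile` and the `clusters` namespace are assumptions, not taken from this change), such a ConfigMap could look like:

```
# Illustrative sketch only; names below are assumed, not from this change
apiVersion: v1
kind: ConfigMap
metadata:
  name: tuned-1
  namespace: clusters
data:
  tuned: |
    apiVersion: tuned.openshift.io/v1
    kind: Tuned
    metadata:
      name: tuned-1
      namespace: openshift-cluster-node-tuning-operator
    spec:
      profile:
      - data: |
          [main]
          summary=Custom OpenShift profile
          include=openshift-node
          [sysctl]
          vm.dirty_ratio="55"
        name: tuned-1-profile
      recommend:
      - priority: 20
        profile: tuned-1-profile
        match:
        - label: tuned-1-node-label
```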
@@ -84,7 +85,7 @@ If you would like to set some node-level tuning on the nodes in your hosted clus

```
nodepool-1-worker-2   tuned-1-profile   True      False      7m14s
```

As we can see, both worker nodes in the NodePool have the tuned-1-profile applied. Note that if no custom profiles are created, the `openshift-node` profile will be applied by default.

3. To confirm the tuning was applied correctly, we can start a debug shell on a Node and check the sysctl values:
@@ -95,4 +96,126 @@ If you would like to set some node-level tuning on the nodes in your hosted clus

Example output:
```
vm.dirty_ratio = 55
```
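The debug command that produces this output is unchanged context not shown in this diff. A possible invocation, assuming the node name `nodepool-1-worker-1` used elsewhere in this example:

```
# Hypothetical example: query the sysctl value directly on one of the tuned nodes
oc --kubeconfig="$HC_KUBECONFIG" debug node/nodepool-1-worker-1 -- chroot /host sysctl vm.dirty_ratio
```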
## Applying tuning which requires kernel boot parameters

You can also use the Node Tuning Operator for more complex tuning which requires setting kernel boot parameters. As an example, the following steps can be followed to create a NodePool with huge pages reserved.
1. Create the following ConfigMap, which contains a Tuned object manifest for creating 50 huge pages of size 2M.
Save this ConfigMap manifest in a file called `tuned-hugepages.yaml`:
```
apiVersion: v1
kind: ConfigMap
metadata:
  name: tuned-hugepages
  namespace: clusters
data:
  tuned: |
    apiVersion: tuned.openshift.io/v1
    kind: Tuned
    metadata:
      name: hugepages
      namespace: openshift-cluster-node-tuning-operator
    spec:
      profile:
      - data: |
          [main]
          summary=Boot time configuration for hugepages
          include=openshift-node
          [bootloader]
          cmdline_openshift_node_hugepages=hugepagesz=2M hugepages=50
        name: openshift-node-hugepages
      recommend:
      - priority: 20
        profile: openshift-node-hugepages
```
> **_NOTE:_** The `.spec.recommend.match` field is intentionally left blank. In this case this Tuned will be applied to all Nodes in the NodePool where this ConfigMap is referenced. It is advised to group Nodes with the same hardware configuration into the same NodePool. Not following this practice might result in TuneD operands calculating conflicting kernel parameters for two or more nodes sharing the same NodePool.

Create the ConfigMap in the management cluster:
```
oc --kubeconfig="$MGMT_KUBECONFIG" create -f tuned-hugepages.yaml
```
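This quick check is not part of the original steps, but confirming the ConfigMap exists before referencing it from a NodePool can save a debugging round trip (the `clusters` namespace and kubeconfig variable are those used above):

```
# Hypothetical example: verify the ConfigMap was created in the management cluster
oc --kubeconfig="$MGMT_KUBECONFIG" get configmap tuned-hugepages -n clusters
```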
2. Create a new NodePool manifest YAML file, customize the NodePool's upgrade type, and reference the previously created ConfigMap in the `spec.tunedConfig` section before creating it in the management cluster.
Create the NodePool manifest and save it in a file called `hugepages-nodepool.yaml`:
```
NODEPOOL_NAME=hugepages-nodepool
INSTANCE_TYPE=m5.2xlarge
NODEPOOL_REPLICAS=2

hypershift create nodepool aws \
  --cluster-name $CLUSTER_NAME \
  --name $NODEPOOL_NAME \
  --node-count $NODEPOOL_REPLICAS \
  --instance-type $INSTANCE_TYPE \
  --render > hugepages-nodepool.yaml
```
Edit `hugepages-nodepool.yaml`. Set `.spec.management.upgradeType` to `InPlace`, and set `.spec.tunedConfig` to reference the `tuned-hugepages` ConfigMap you created.
```
apiVersion: hypershift.openshift.io/v1alpha1
kind: NodePool
metadata:
  name: hugepages-nodepool
  namespace: clusters
...
spec:
  management:
    ...
    upgradeType: InPlace
  ...
  tunedConfig:
  - name: tuned-hugepages
```
> **_NOTE:_** Setting `.spec.management.upgradeType` to `InPlace` is recommended to avoid unnecessary Node recreations when applying the new MachineConfigs. With the `Replace` upgrade type, Nodes will be fully deleted and new nodes will replace them when applying the new kernel boot parameters that are calculated by the TuneD operand.
Create the NodePool in the management cluster:
```
oc --kubeconfig="$MGMT_KUBECONFIG" create -f hugepages-nodepool.yaml
```
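The original steps do not include a check at this point, but one way to watch the new NodePool and its Nodes come up (assuming the `clusters` namespace and the kubeconfig variables used above) is:

```
# Hypothetical example: watch NodePool status from the management cluster
oc --kubeconfig="$MGMT_KUBECONFIG" get nodepools -n clusters

# ...and the Nodes from the hosted cluster
oc --kubeconfig="$HC_KUBECONFIG" get nodes
```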
3. After the Nodes become available, the containerized TuneD daemon will calculate the required kernel boot parameters based on the applied TuneD profile. After the Nodes become `Ready` and reboot once to apply the generated MachineConfig, you can verify that the Tuned profile is applied and that the kernel boot parameters have been set.

List the Tuned objects in the hosted cluster:
```
oc --kubeconfig="$HC_KUBECONFIG" get Tuneds -n openshift-cluster-node-tuning-operator
```

Example output:
```
NAME                 AGE
default              123m
hugepages-8dfb1fed   1m23s
rendered             123m
```
List the Profiles in the hosted cluster:
```
oc --kubeconfig="$HC_KUBECONFIG" get Profiles -n openshift-cluster-node-tuning-operator
```

Example output:
```
NAME                          TUNED                      APPLIED   DEGRADED   AGE
nodepool-1-worker-1           openshift-node             True      False      132m
nodepool-1-worker-2           openshift-node             True      False      131m
hugepages-nodepool-worker-1   openshift-node-hugepages   True      False      4m8s
hugepages-nodepool-worker-2   openshift-node-hugepages   True      False      3m57s
```

**Member:** What happens if someone modifies these profiles?

**Contributor (Author):** @enxebre I'm not sure I know what you are asking. Are you wondering what would happen if someone modified the Profile objects from the hosted cluster side?

**Member:** @dagrayvid Yes, is it possible to change something on the guest cluster side which the NTO watches and reconciles against management-side config, and so trigger an upgrade?

**Contributor (Author):** In theory, yes. This was discussed in some of the earlier design discussions about enabling NTO on HyperShift. As in standalone OCP, the NTO Operand (the containerized TuneD daemon) writes the kernel boot parameters calculated by TuneD, based on the applied profile, to the Profile object's `status.bootcmdline` field. This field is read by the Operator before creating or updating the NTO-generated MachineConfig. If the Profile object were edited by someone with admin privileges on the guest cluster, the Operator and Operand would reconcile simultaneously. The Operand would reconcile the Profile, overwriting any change in the status. The Operator would also reconcile the Profile, potentially updating the NTO-generated MachineConfig based on the changed `status.bootcmdline`, in a race with the Operand. If the Operator "loses" the race, then after the Operand overwrites any admin-user changes to the Profile, the Operator will reconcile the Profile again, syncing the MachineConfig. When we discussed this earlier on, the answer was that this should be "okay", as admin users of the hosted cluster already have root access to the nodes (i.e. `oc debug`).
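As a hedged illustration of the mechanism discussed above (the field name `status.bootcmdline` is taken from the comment itself; the Profile name is assumed from the example output), the value the TuneD operand calculated could be inspected with:

```
# Hypothetical example: print the boot cmdline the TuneD operand wrote for one Profile
oc --kubeconfig="$HC_KUBECONFIG" get profile hugepages-nodepool-worker-1 \
  -n openshift-cluster-node-tuning-operator -o jsonpath='{.status.bootcmdline}'
```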
Both worker nodes in the new NodePool have the `openshift-node-hugepages` profile applied.

4. To confirm the tuning was applied correctly, we can start a debug shell on a Node and check `/proc/cmdline`:
```
oc --kubeconfig="$HC_KUBECONFIG" debug node/hugepages-nodepool-worker-1 -- chroot /host cat /proc/cmdline
```

Example output:
```
BOOT_IMAGE=(hd0,gpt3)/ostree/rhcos-... hugepagesz=2M hugepages=50
```
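As an additional check not in the original steps, the reserved huge pages can also be confirmed from the same kind of debug shell (node name assumed as above); if the boot parameters took effect, `HugePages_Total` should report 50:

```
# Hypothetical example: confirm the huge page reservation on the node
oc --kubeconfig="$HC_KUBECONFIG" debug node/hugepages-nodepool-worker-1 -- chroot /host grep HugePages_Total /proc/meminfo
```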
**Review comment:** This is IMPORTANT, but let's leave it just a NOTE. This bogged me down as the first "user" of the new code. An aside, yesterday I was testing with the `.spec.recommend.match` field targeting a single node in the node pool. It is probably the reason I was getting this and something that still needs to be addressed on the NTO side.

**Review comment:** There should be no possible choice for the user to target particular nodes within a NodePool.

**Review comment:** This behaviour is based on how NTO works in standalone OCP. There are many cases where a TuneD profile is making in-place changes to node tunables and no MachineConfig is needed. For example, if a user wants to set some sysctl values on one node with particular labels and assign some Pods only to that Node by label. If we do decide to remove this feature in HyperShift, we can do that, but it would be a change to the NTO code.

**Review comment:** Also note that the original issue Jiri hit here was fixed. If a user does use Node label based matching, no MachineConfig will be generated based on that Profile.