Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,21 @@
logger = get_logger(__name__)
DCR_API_VERSION = "2022-06-01"

ContainerInsightsStreams = [
"Microsoft-ContainerLog",
"Microsoft-ContainerLogV2-HighScale",
"Microsoft-KubeEvents",
"Microsoft-KubePodInventory",
"Microsoft-KubeNodeInventory",
"Microsoft-KubePVInventory",
"Microsoft-KubeServices",
"Microsoft-KubeMonAgentEvents",
"Microsoft-InsightsMetrics",
"Microsoft-ContainerInventory",
"Microsoft-ContainerNodeInventory",
Comment thread
wanlonghenry marked this conversation as resolved.
"Microsoft-Perf",
]


class ContainerInsights(DefaultExtension):
def Create(self, cmd, client, resource_group_name, cluster_name, name, cluster_type, cluster_rp,
Expand Down Expand Up @@ -83,6 +98,7 @@ def Delete(self, cmd, client, resource_group_name, cluster_name, name, cluster_t
# Delete DCR-A if it exists incase of MSI Auth
useAADAuth = False
isDCRAExists = False
enable_high_log_scale_mode = False
cluster_rp, _ = get_cluster_rp_api_version(cluster_type=cluster_type, cluster_rp=cluster_rp)
try:
extension = client.get(resource_group_name, cluster_rp, cluster_type, cluster_name, name)
Expand All @@ -95,10 +111,15 @@ def Delete(self, cmd, client, resource_group_name, cluster_name, name, cluster_t
return

subscription_id = get_subscription_id(cmd.cli_ctx)
resources = cf_resources(cmd.cli_ctx, subscription_id)
# handle cluster type here
cluster_resource_id = '/subscriptions/{0}/resourceGroups/{1}/providers/{2}/{3}/{4}'.format(subscription_id, resource_group_name, cluster_rp, cluster_type, cluster_name)
workspace_resource_id = None
if (extension is not None) and (extension.configuration_settings is not None):
configSettings = extension.configuration_settings
# Extract workspace resource ID if present
if 'logAnalyticsWorkspaceResourceID' in configSettings:
workspace_resource_id = configSettings['logAnalyticsWorkspaceResourceID']
# omsagent is being renamed to ama-logs. Check for both for compatibility
if 'omsagent.useAADAuth' in configSettings:
useAADAuthSetting = configSettings['omsagent.useAADAuth']
Expand All @@ -108,6 +129,16 @@ def Delete(self, cmd, client, resource_group_name, cluster_name, name, cluster_t
useAADAuthSetting = configSettings['amalogs.useAADAuth']
if (isinstance(useAADAuthSetting, str) and str(useAADAuthSetting).lower() == "true") or (isinstance(useAADAuthSetting, bool) and useAADAuthSetting):
useAADAuth = True

# Check if high log scale mode was enabled
if useAADAuth and 'amalogs.enableHighLogScaleMode' in configSettings:
highLogScaleSetting = configSettings['amalogs.enableHighLogScaleMode']
if isinstance(highLogScaleSetting, str):
enable_high_log_scale_mode = (highLogScaleSetting.lower() == "true")
elif isinstance(highLogScaleSetting, bool):
enable_high_log_scale_mode = highLogScaleSetting
Comment thread
wanlonghenry marked this conversation as resolved.
else:
raise InvalidArgumentValueError('amalogs.enableHighLogScaleMode value MUST be either true/false or boolean type')
if useAADAuth:
association_url = cmd.cli_ctx.cloud.endpoints.resource_manager + f"{cluster_resource_id}/providers/Microsoft.Insights/dataCollectionRuleAssociations/ContainerInsightsExtension?api-version={DCR_API_VERSION}"
for _ in range(3):
Expand All @@ -131,6 +162,41 @@ def Delete(self, cmd, client, resource_group_name, cluster_name, name, cluster_t
except Exception:
pass # its OK to ignore the exception since MSI auth in preview

if useAADAuth:
# Get the workspace region if workspace_resource_id is available
workspace_region = None
if workspace_resource_id:
try:
workspace_resource = resources.get_by_id(workspace_resource_id, '2015-11-01-preview')
workspace_region = workspace_resource.location.replace(" ", "").lower()
except Exception as ex:
logger.warning("Skipping DCR and DCE deletion due to inability to determine workspace region")
return
# If workspace_region still couldn't be determined, skip deletion
if not workspace_region:
logger.warning("Workspace region could not be determined. Skipping DCR and DCE deletion.")
return

# Use workspace_region for DCR name to match creation logic
dcr_name = f"MSCI-{workspace_region}-{cluster_name}"
dcr_name = dcr_name[0:64]

dcr_resource_id = f"/subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.Insights/dataCollectionRules/{dcr_name}"
dcr_url = cmd.cli_ctx.cloud.endpoints.resource_manager + f"{dcr_resource_id}?api-version={DCR_API_VERSION}"
response = send_raw_request(cmd.cli_ctx, "GET", dcr_url)
dcr_config = json.loads(response.text)
# Delete the DCR
for _ in range(3):
try:
send_raw_request(cmd.cli_ctx, "DELETE", dcr_url,)
logger.info(f"Successfully deleted DCR: {dcr_name}")
break
except Exception as ex:
logger.warning(f"Error deleting DCR: {str(ex)}")
pass

if enable_high_log_scale_mode:
_delete_dce_for_dcr(cmd, subscription_id, resource_group_name, dcr_config)

# Custom Validation Logic for Container Insights

Expand Down Expand Up @@ -464,6 +530,7 @@ def _get_container_insights_settings(cmd, cluster_resource_group_name, cluster_r
subscription_id = get_subscription_id(cmd.cli_ctx)
workspace_resource_id = ''
useAADAuth = True
enableHighLogScaleMode = False # Default value
if 'amalogs.useAADAuth' not in configuration_settings:
configuration_settings['amalogs.useAADAuth'] = "true"
extensionSettings = {}
Expand Down Expand Up @@ -520,6 +587,16 @@ def _get_container_insights_settings(cmd, cluster_resource_group_name, cluster_r
raise InvalidArgumentValueError('streams must be an array type')
extensionSettings["dataCollectionSettings"] = dataCollectionSettings

if useAADAuth and 'amalogs.enableHighLogScaleMode' in configuration_settings:
enableHighLogScaleMode = configuration_settings['amalogs.enableHighLogScaleMode']
if isinstance(enableHighLogScaleMode, str):
enableHighLogScaleMode_str = enableHighLogScaleMode.lower()
if enableHighLogScaleMode_str not in ["true", "false"]:
raise InvalidArgumentValueError('amalogs.enableHighLogScaleMode value MUST be either true or false')
enableHighLogScaleMode = (enableHighLogScaleMode_str == "true")
elif not isinstance(enableHighLogScaleMode, bool):
raise InvalidArgumentValueError('amalogs.enableHighLogScaleMode value MUST be either true or false')

workspace_resource_id = workspace_resource_id.strip()

if configuration_protected_settings is not None:
Expand Down Expand Up @@ -548,7 +625,7 @@ def _get_container_insights_settings(cmd, cluster_resource_group_name, cluster_r
if is_ci_extension_type:
if useAADAuth:
logger.info("creating data collection rule and association")
_ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_resource_group_name, cluster_rp, cluster_type, cluster_name, workspace_resource_id, extensionSettings)
_ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_resource_group_name, cluster_rp, cluster_type, cluster_name, workspace_resource_id, extensionSettings, enableHighLogScaleMode)
elif not _is_container_insights_solution_exists(cmd, workspace_resource_id):
logger.info("Creating ContainerInsights solution resource, since it doesn't exist and it is using legacy authentication")
_ensure_container_insights_for_monitoring(cmd, workspace_resource_id).result()
Expand Down Expand Up @@ -597,6 +674,37 @@ def _get_container_insights_settings(cmd, cluster_resource_group_name, cluster_r
configuration_settings['amalogs.domain'] = 'opinsights.azure.microsoft.scloud'


def _delete_dce_for_dcr(cmd, subscription_id, cluster_resource_group_name, dcr_config):
"""Delete Data Collection Endpoint associated with a DCR if it exists"""
try:
if ("properties" in dcr_config and
"dataCollectionEndpointId" in dcr_config["properties"] and
dcr_config["properties"]["dataCollectionEndpointId"]):

dce_id = dcr_config["properties"]["dataCollectionEndpointId"]
dce_parts = dce_id.split('/')

if len(dce_parts) > 0:
dce_name = dce_parts[-1]
dce_resource_id = f"/subscriptions/{subscription_id}/resourceGroups/{cluster_resource_group_name}/providers/Microsoft.Insights/dataCollectionEndpoints/{dce_name}"
dce_url = cmd.cli_ctx.cloud.endpoints.resource_manager + f"{dce_resource_id}?api-version=2022-06-01"
Comment thread
wanlonghenry marked this conversation as resolved.
# Try to delete up to 3 times
for retry in range(3):
try:
send_raw_request(cmd.cli_ctx, "DELETE", dce_url)
logger.info("Successfully deleted DCE: %s", dce_name)
return True
except CLIError as e:
if "ResourceNotFound" in str(e):
return True
if retry == 2:
logger.warning("Failed to delete DCE: %s - %s", dce_name, str(e))
return False
logger.info("Retrying DCE deletion after error: %s", str(e))
except CLIError:
pass
return True

def get_existing_container_insights_extension_dcr_tags(cmd, dcr_url):
tags = {}
_MAX_RETRY_TIMES = 3
Expand All @@ -617,7 +725,7 @@ def get_existing_container_insights_extension_dcr_tags(cmd, dcr_url):
return tags


def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_resource_group_name, cluster_rp, cluster_type, cluster_name, workspace_resource_id, extensionSettings):
def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_resource_group_name, cluster_rp, cluster_type, cluster_name, workspace_resource_id, extensionSettings, enable_high_log_scale_mode):
from azure.core.exceptions import HttpResponseError

cluster_region = ''
Expand Down Expand Up @@ -652,6 +760,18 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
dataCollectionRuleName = dataCollectionRuleName[0:64]
dcr_resource_id = f"/subscriptions/{subscription_id}/resourceGroups/{cluster_resource_group_name}/providers/Microsoft.Insights/dataCollectionRules/{dataCollectionRuleName}"

# ingestion DCE MUST be in workspace region
ingestionDataCollectionEndpointName = f"MSCI-ingest-{workspace_region}-{cluster_name}"
# Max length of the DCE name is 43 chars
ingestionDataCollectionEndpointName = _trim_suffix_if_needed(ingestionDataCollectionEndpointName[0:43])
ingestion_dce_resource_id = None

# create ingestion DCE if high log scale mode enabled
if enable_high_log_scale_mode:
ingestion_dce_resource_id = create_data_collection_endpoint(
cmd, subscription_id, cluster_resource_group_name, workspace_region, ingestionDataCollectionEndpointName
)

# first get the association between region display names and region IDs (because for some reason
# the "which RPs are available in which regions" check returns region display names)
region_names_to_id = {}
Expand All @@ -677,6 +797,8 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
# get existing tags on the container insights extension DCR if the customer added any
existing_tags = get_existing_container_insights_extension_dcr_tags(cmd, dcr_url)
streams = ["Microsoft-ContainerInsights-Group-Default"]
if enable_high_log_scale_mode:
streams = ContainerInsightsStreams
if extensionSettings is None:
extensionSettings = {}
if 'dataCollectionSettings' in extensionSettings.keys():
Expand All @@ -691,6 +813,11 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
}
extensionSettings["dataCollectionSettings"] = dataCollectionSettings

if enable_high_log_scale_mode:
for i, v in enumerate(streams):
if v == "Microsoft-ContainerLogV2":
streams[i] = "Microsoft-ContainerLogV2-HighScale"
Comment thread
wanlonghenry marked this conversation as resolved.

# create the DCR
dcr_creation_body = json.dumps(
{
Expand Down Expand Up @@ -722,6 +849,7 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
}
]
},
"dataCollectionEndpointId": ingestion_dce_resource_id
Comment thread
wanlonghenry marked this conversation as resolved.
},
}
)
Expand Down Expand Up @@ -755,3 +883,34 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
error = e
else:
raise error


def create_data_collection_endpoint(cmd, subscription_id, cluster_resource_group_name, workspace_region, ingestionDataCollectionEndpointName):
# create the ingestion DCE
ingestion_dce_resource_id = f"/subscriptions/{subscription_id}/resourceGroups/{cluster_resource_group_name}/providers/Microsoft.Insights/dataCollectionEndpoints/{ingestionDataCollectionEndpointName}"
ingestion_dce_url = cmd.cli_ctx.cloud.endpoints.resource_manager + f"{ingestion_dce_resource_id}?api-version=2022-06-01"
ingestion_dce_creation_body = json.dumps({
"location": workspace_region,
"kind": "Linux",
"properties": {
"networkAcls": {
"publicNetworkAccess": "Enabled"
}
}
})
error = None
for _ in range(3):
try:
send_raw_request(cmd.cli_ctx, "PUT", ingestion_dce_url, body=ingestion_dce_creation_body)
return ingestion_dce_resource_id
except AzCLIError as e:
error = e
if error:
raise error
return ingestion_dce_resource_id


def _trim_suffix_if_needed(s, suffix="-"):
if s.endswith(suffix):
s = s[:-len(suffix)]
return s
4 changes: 4 additions & 0 deletions testing/pipeline/k8s-custom-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ stages:
parameters:
jobName: AzureMonitor
path: ./test/extensions/public/AzureMonitor.Tests.ps1
- template: ./templates/run-test.yml
parameters:
jobName: AzureMonitorHighScale
path: ./test/extensions/public/AzureMonitorHighScale.Tests.ps1
- template: ./templates/run-test.yml
parameters:
jobName: AzurePolicy
Expand Down
75 changes: 75 additions & 0 deletions testing/test/extensions/public/AzureMonitorHighScale.Tests.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
Describe 'Azure Monitor High Scale Mode Testing' {
BeforeAll {
$extensionType = "microsoft.azuremonitor.containers"
$extensionName = "azuremonitor-containers"
$extensionAgentName = "omsagent"
$extensionAgentNamespace = "kube-system"

. $PSScriptRoot/../../helper/Constants.ps1
. $PSScriptRoot/../../helper/Helper.ps1
}

It 'Creates the extension with high log scale mode and checks that it onboards correctly' {
az $Env:K8sExtensionName create -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) `
--cluster-type connectedClusters --extension-type $extensionType -n $extensionName `
--configuration-settings "amalogs.enableHighLogScaleMode=true" --no-wait
$? | Should -BeTrue

$output = az $Env:K8sExtensionName show -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters -n $extensionName
$? | Should -BeTrue

$extension = ($output | ConvertFrom-Json)
$isAutoUpgradeMinorVersion = $extension.autoUpgradeMinorVersion
$isAutoUpgradeMinorVersion.ToString() -eq "True" | Should -BeTrue

# Verify high scale mode configuration
$settings = $extension.configurationSettings
$settings.'amalogs.enableHighLogScaleMode' | Should -Be "true"

# Loop and retry until the extension installs
$n = 0
do
{
if (Has-ExtensionData $extensionName) {
break
}
Start-Sleep -Seconds 10
$n += 1
} while ($n -le $MAX_RETRY_ATTEMPTS)
$n | Should -BeLessOrEqual $MAX_RETRY_ATTEMPTS
}

It "Performs a show on the extension" {
$output = az $Env:K8sExtensionName show -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters -n $extensionName
$? | Should -BeTrue
$output | Should -Not -BeNullOrEmpty
}

It "Lists the extensions on the cluster" {
$output = az $Env:K8sExtensionName list -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters
$? | Should -BeTrue

$output | Should -Not -BeNullOrEmpty
$extensionExists = $output | ConvertFrom-Json | Where-Object { $_.extensionType -eq $extensionType }
$extensionExists | Should -Not -BeNullOrEmpty
}

It "Deletes the extension from the cluster" {
$output = az $Env:K8sExtensionName delete -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters -n $extensionName --force
$? | Should -BeTrue

# Extension should not be found on the cluster
$output = az $Env:K8sExtensionName show -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters -n $extensionName
$? | Should -BeFalse
$output | Should -BeNullOrEmpty
}

It "Performs another list after the delete" {
$output = az $Env:K8sExtensionName list -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters
$? | Should -BeTrue
$output | Should -Not -BeNullOrEmpty

$extensionExists = $output | ConvertFrom-Json | Where-Object { $_.extensionType -eq $extensionName }
$extensionExists | Should -BeNullOrEmpty
}
}
Loading