Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
9ce20d9
Alert triage agent initial commit
hsin-c May 2, 2025
d1030f1
Track data folder with Git LFS
hsin-c May 2, 2025
47834dd
Update examples/alert_triage_agent/README.md
hsin-c May 2, 2025
a2b8b98
Update examples/alert_triage_agent/pyproject.toml
hsin-c May 2, 2025
12c7da2
Address comments
hsin-c May 2, 2025
9faace0
Merge remote-tracking branch 'origin/hsinc-alert-triage-agent' into h…
hsin-c May 2, 2025
bd2679b
Fix str not awaitable issue
hsin-c May 2, 2025
e47204f
Update examples/alert_triage_agent/pyproject.toml
hsin-c May 2, 2025
cc370f2
Run linting
hsin-c May 2, 2025
b5e6178
Fix formatting
hsin-c May 2, 2025
60ea5b6
Precommit run
hsin-c May 2, 2025
0057901
Merge branch 'hsinc-alert-triage-agent' of github.com:hsin-c/AIQToolk…
hsin-c May 2, 2025
b19c20c
Update field name
hsin-c May 2, 2025
a5e2a23
Fix typos in readme; add categorizer as an accepted word
hsin-c May 2, 2025
2fad832
Add a link to live and test modes at the top of sample usage
AnuradhaKaruppiah May 5, 2025
e06ce54
Add symlinks for configs and data
AnuradhaKaruppiah May 5, 2025
b2122bc
Merge remote-tracking branch 'upstream/develop' into hsinc-alert-tria…
hsin-c May 5, 2025
b011045
Update uv.lock
hsin-c May 5, 2025
e27d9fb
Add aiq_alert_triage_agent to the parent pyproject.toml
AnuradhaKaruppiah May 5, 2025
21ced94
Merge remote-tracking branch 'upstream/develop' into hsin-alert-triag…
AnuradhaKaruppiah May 5, 2025
39135ff
Update uv.lock
AnuradhaKaruppiah May 5, 2025
23809c2
Miscellaneous fixes
AnuradhaKaruppiah May 5, 2025
68b4f86
Update README; update data parsing
hsin-c May 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ examples/simple_calculator/src/aiq_simple_calculator/data/** filter=lfs diff=lfs
examples/simple/src/aiq_simple/data/** filter=lfs diff=lfs merge=lfs -text
examples/swe_bench/src/aiq_swe_bench/data/** filter=lfs diff=lfs merge=lfs -text
docs/source/_static/*.png filter=lfs diff=lfs merge=lfs -text
examples/alert_triage_agent/src/aiq_alert_triage_agent/data/** filter=lfs diff=lfs merge=lfs -text
35 changes: 35 additions & 0 deletions examples/alert_triage_agent/.env_example
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Example .env file
# Required for accessing NVIDIA AI services
NVIDIA_API_KEY=""

# Set to "true" to run agent in test mode using synthetic data instead of live systems
TEST_MODE=true

# All file paths below should be relative to register.py location

# Path to CSV containing scheduled maintenance window data
MAINTENANCE_STATIC_DATA_PATH="data/maintenance_static_dataset.csv"

# Main test dataset in CSV format containing alerts and their simulated environments
TEST_DATA_RELATIVE_FILEPATH="data/test_data.csv"

# JSON file with baseline/normal system behavior data
TEST_BENIGN_DATA_RELATIVE_FILEPATH="data/benign_fallback_test_data.json"

# Path where test results will be saved as a CSV file
TEST_OUTPUT_RELATIVE_FILEPATH="output/test_output.csv"
329 changes: 329 additions & 0 deletions examples/alert_triage_agent/README.md

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions examples/alert_triage_agent/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools >= 64", "setuptools-scm>=8"]

[tool.setuptools_scm]
root = "../.."

[project]
name = "aiq_alert_triage_agent"
dynamic = ["version"]
dependencies = [
"aiqtoolkit[langchain]",
"langchain-core", # version determined by aiqtoolkit[langchain]
"pandas>=2.0.0",
"ansible-runner>=2.3.0",
"langgraph>=0.0.10", # version determined by aiqtoolkit[langchain]
"flask>=3.0.0"
]
requires-python = ">=3.11,<3.13"
description = "Alert Triage AIQ Toolkit example"
classifiers = ["Programming Language :: Python"]

[tool.uv.sources]
aiq = { path = "../..", editable = true }

[project.entry-points.'aiq.components']
aiq_alert_triage_agent = "aiq_alert_triage_agent.register"
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import MessagesPlaceholder
from pydantic import Field

from aiq.builder.builder import Builder
from aiq.builder.framework_enum import LLMFrameworkEnum
from aiq.builder.function_info import FunctionInfo
from aiq.cli.register_workflow import register_function
from aiq.data_models.component_ref import LLMRef
from aiq.data_models.function import FunctionBaseConfig

from . import utils
from .prompts import PipelineNodePrompts


class CategorizerToolConfig(FunctionBaseConfig, name="categorizer"):
    """Configuration for the root-cause categorizer pipeline tool."""

    # Tool description surfaced to the agent that invokes this tool.
    description: str = Field(
        default="This is a categorization tool used at the end of the pipeline.",
        description="Description of the tool."
    )
    # Reference to the LLM used to categorize the triage report.
    llm_name: LLMRef


@register_function(config_type=CategorizerToolConfig)
async def categorizer_tool(config: CategorizerToolConfig, builder: Builder):
    """Register the root-cause categorizer tool.

    Builds a LangChain chain (system categorizer prompt + message placeholder
    piped into the configured LLM) and yields a function that appends a
    "Root Cause Category" section to a markdown triage report.
    """
    # Set up LLM and chain
    llm = await builder.get_llm(config.llm_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN)
    prompt_template = ChatPromptTemplate(
        [("system", PipelineNodePrompts.CATEGORIZER_PROMPT), MessagesPlaceholder("msgs")])
    categorization_chain = prompt_template | llm

    async def _arun(report: str) -> str:
        """Categorize the root cause described in *report* and return a markdown section."""
        tool_name = "Root Cause Categorizer"
        utils.log_header(tool_name)

        result = await categorization_chain.ainvoke({"msgs": [HumanMessage(content=report)]})

        # Extract the markdown heading level from first line of report (e.g. '#' or '##')
        pound_signs = report.split('\n')[0].split(' ')[0]

        # BUGFIX: backslashes inside an f-string expression are a SyntaxError
        # before Python 3.12 (PEP 701), and this project supports 3.11
        # (requires-python >=3.11,<3.13). Do the replace outside the f-string.
        # Extra newline between category and reasoning improves readability.
        spaced_content = result.content.replace('\n', '\n\n')

        # Format the root cause category section:
        # - Add newlines before the section
        # - Use extracted heading level for consistency with the report
        report_section = f"\n\n{pound_signs} Root Cause Category\n{spaced_content}"

        # Log the result for tracking
        utils.logger.debug(result.content)
        utils.log_footer()

        return report_section

    yield FunctionInfo.from_fn(_arun, description=config.description)
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

general:
  use_uvloop: true

# Tool functions available to the alert triage workflow.
functions:
  hardware_check:
    _type: hardware_check
    llm_name: tool_reasoning_llm
  host_performance_check:
    _type: host_performance_check
    llm_name: tool_reasoning_llm
  monitoring_process_check:
    _type: monitoring_process_check
    llm_name: tool_reasoning_llm
  network_connectivity_check:
    _type: network_connectivity_check
    llm_name: tool_reasoning_llm
  telemetry_metrics_host_heartbeat_check:
    _type: telemetry_metrics_host_heartbeat_check
    llm_name: tool_reasoning_llm
  telemetry_metrics_host_performance_check:
    _type: telemetry_metrics_host_performance_check
    llm_name: tool_reasoning_llm
  # Sub-agent that wraps the two telemetry checks above.
  telemetry_metrics_analysis_agent:
    _type: telemetry_metrics_analysis_agent
    tool_names:
      - telemetry_metrics_host_heartbeat_check
      - telemetry_metrics_host_performance_check
    llm_name: telemetry_metrics_analysis_agent_llm
  maintenance_check:
    _type: maintenance_check
    llm_name: maintenance_check_llm
  categorizer:
    _type: categorizer
    llm_name: categorizer_llm

# Top-level triage agent and the tools it may call.
workflow:
  _type: alert_triage_agent
  tool_names:
    - hardware_check
    - host_performance_check
    - monitoring_process_check
    - network_connectivity_check
    - telemetry_metrics_analysis_agent
  llm_name: ata_agent_llm

# LLM endpoints (all NIM-hosted llama-3.3-70b-instruct with per-role sampling).
llms:
  ata_agent_llm:
    _type: nim
    model_name: meta/llama-3.3-70b-instruct
    temperature: 0.2
    max_tokens: 2048

  tool_reasoning_llm:
    _type: nim
    model_name: meta/llama-3.3-70b-instruct
    temperature: 0.2
    top_p: 0.7
    max_tokens: 2048

  telemetry_metrics_analysis_agent_llm:
    _type: nim
    model_name: meta/llama-3.3-70b-instruct
    temperature: 0
    max_tokens: 2048

  maintenance_check_llm:
    _type: nim
    model_name: meta/llama-3.3-70b-instruct
    temperature: 0
    max_tokens: 2048

  categorizer_llm:
    _type: nim
    model_name: meta/llama-3.3-70b-instruct
    temperature: 0
    max_tokens: 2048
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import subprocess
from pydantic import Field
from aiq.builder.builder import Builder
from aiq.builder.function_info import FunctionInfo
from aiq.cli.register_workflow import register_function
from aiq.data_models.component_ref import LLMRef
from aiq.data_models.function import FunctionBaseConfig

from . import utils
from .prompts import ToolReasoningLayerPrompts
def _get_ipmi_monitor_data(ip_address, username, password):
    """Run the ``ipmimonitoring`` CLI against a host and return its stdout.

    NOTE: This is just an example implementation of hardware status checking using IPMI.
    Users should implement their own hardware check commands specific to their environment
    and infrastructure setup. The key is to return hardware health/status information in
    a format that can be analyzed.

    Args:
        ip_address (str): The IP address of the device.
        username (str): The user credential for ipmi monitoring.
        password (str): The password credential for ipmi monitoring.

    Returns:
        str: The command's output if successful, otherwise None.
    """
    # Assemble the ipmimonitoring invocation (list form: no shell involved).
    cmd = [
        "ipmimonitoring",
        "-h", ip_address,
        "-u", username,
        "-p", password,
        "--privilege-level=USER",
    ]

    try:
        completed = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        # Non-zero exit code: log the captured stderr and signal failure with None.
        utils.logger.error(f"Error executing IPMI monitoring command. Details: {e.stderr}")
        return None

    return completed.stdout


class HardwareCheckToolConfig(FunctionBaseConfig, name="hardware_check"):
    """Configuration for the IPMI-based hardware health check tool."""

    # Tool description surfaced to the agent; documents the expected argument.
    description: str = Field(
        default="This tool checks hardware health status using IPMI monitoring to detect power state, hardware degradation, and anomalies that could explain alerts. Args: host_id: str",
        description="Description of the tool for the agent."
    )
    # LLM used to reason over / summarize the raw IPMI output.
    llm_name: LLMRef

@register_function(config_type=HardwareCheckToolConfig)
async def hardware_check_tool(config: HardwareCheckToolConfig, builder: Builder):
    """Register the hardware health check tool.

    Yields a function that gathers IPMI monitoring data — live via
    ``ipmimonitoring`` or, in test mode, from the CSV test dataset — and asks
    the configured LLM to summarize the hardware health findings.
    """

    async def _arun(host_id: str) -> str:
        """Run the hardware check for *host_id* and return the LLM's conclusion."""
        is_test_mode = utils.is_test_mode()
        utils.log_header("Hardware Status Checker")

        try:
            if not is_test_mode:
                # NOTE(review): placeholder credentials — replace with real IPMI
                # connection details (or load them from configuration/environment).
                ip = "ipmi_ip"  # Replace with your actual IPMI IP address
                user = "ipmi_user"  # Replace with your actual username
                pwd = "ipmi_password"  # Replace with your actual password
                monitoring_data = _get_ipmi_monitor_data(ip, user, pwd)
            else:
                # In test mode, load test data from CSV file
                df = utils.load_test_data()

                # Get IPMI data from test data, falling back to static data if needed
                monitoring_data = utils.load_column_or_static(
                    df=df,
                    host_id=host_id,
                    column="hardware_check_tool:ipmi_output",
                )

            # Guard clause: no IPMI data could be retrieved.
            if monitoring_data is None:
                utils.logger.debug("No hardware data available")
                return "Hardware check failed: Unable to retrieve hardware monitoring data. This could indicate connectivity issues with the IPMI interface, invalid credentials, or that the IPMI service is not responding."

            # Additional LLM reasoning layer on playbook output to provide a summary of the results
            utils.log_header("LLM Reasoning", dash_length=50)

            prompt = ToolReasoningLayerPrompts.HARDWARE_CHECK.format(input_data=monitoring_data)

            # Get analysis from LLM
            conclusion = await utils.llm_ainvoke(config, builder, prompt)

            utils.logger.debug(conclusion)
            utils.log_footer()

            return conclusion

        except Exception as e:
            # Lazy %-style logging avoids formatting when the level is disabled;
            # bare `raise` preserves the original traceback exactly.
            utils.logger.error("Error during hardware check: %s", e)
            raise

    yield FunctionInfo.from_fn(
        _arun,
        description=config.description,
    )
Loading
Loading