Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
9ce20d9
Alert triage agent initial commit
hsin-c May 2, 2025
d1030f1
Track data folder with Git LFS
hsin-c May 2, 2025
47834dd
Update examples/alert_triage_agent/README.md
hsin-c May 2, 2025
a2b8b98
Update examples/alert_triage_agent/pyproject.toml
hsin-c May 2, 2025
12c7da2
Address comments
hsin-c May 2, 2025
9faace0
Merge remote-tracking branch 'origin/hsinc-alert-triage-agent' into h…
hsin-c May 2, 2025
bd2679b
Fix str not awaitable issue
hsin-c May 2, 2025
e47204f
Update examples/alert_triage_agent/pyproject.toml
hsin-c May 2, 2025
cc370f2
Run linting
hsin-c May 2, 2025
b5e6178
Fix formatting
hsin-c May 2, 2025
60ea5b6
Precommit run
hsin-c May 2, 2025
0057901
Merge branch 'hsinc-alert-triage-agent' of github.com:hsin-c/AIQToolk…
hsin-c May 2, 2025
b19c20c
Update field name
hsin-c May 2, 2025
a5e2a23
Fix typos in readme; add categorizer as an accepted word
hsin-c May 2, 2025
2fad832
Add a link to live and test modes at the top of sample usage
AnuradhaKaruppiah May 5, 2025
e06ce54
Add symlinks for configs and data
AnuradhaKaruppiah May 5, 2025
b2122bc
Merge remote-tracking branch 'upstream/develop' into hsinc-alert-tria…
hsin-c May 5, 2025
b011045
Update uv.lock
hsin-c May 5, 2025
e27d9fb
Add aiq_alert_triage_agent to the parent pyproject.toml
AnuradhaKaruppiah May 5, 2025
21ced94
Merge remote-tracking branch 'upstream/develop' into hsin-alert-triag…
AnuradhaKaruppiah May 5, 2025
39135ff
Update uv.lock
AnuradhaKaruppiah May 5, 2025
23809c2
Miscellaneous fixes
AnuradhaKaruppiah May 5, 2025
68b4f86
Update README; update data parsing
hsin-c May 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ examples/simple_calculator/src/aiq_simple_calculator/data/** filter=lfs diff=lfs
examples/simple/src/aiq_simple/data/** filter=lfs diff=lfs merge=lfs -text
examples/swe_bench/src/aiq_swe_bench/data/** filter=lfs diff=lfs merge=lfs -text
docs/source/_static/*.png filter=lfs diff=lfs merge=lfs -text
examples/alert_triage_agent/src/aiq_alert_triage_agent/data/** filter=lfs diff=lfs merge=lfs -text
35 changes: 35 additions & 0 deletions examples/alert_triage_agent/.env_example
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Example .env file
# Required for accessing NVIDIA AI services
NVIDIA_API_KEY=""

# Set to "true" to run agent in test mode using synthetic data instead of live systems
TEST_MODE=true

# All file paths below should be relative to register.py location

# Path to CSV containing scheduled maintenance window data
MAINTENANCE_STATIC_DATA_PATH="data/maintenance_static_dataset.csv"

# Main test dataset in CSV format containing alerts and their simulated environments
TEST_DATA_RELATIVE_FILEPATH="data/test_data.csv"

# JSON file with baseline/normal system behavior data
TEST_BENIGN_DATA_RELATIVE_FILEPATH="data/benign_fallback_test_data.json"

# Path where test results will be saved as a CSV file
TEST_OUTPUT_RELATIVE_FILEPATH="output/test_output.csv"
329 changes: 329 additions & 0 deletions examples/alert_triage_agent/README.md

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions examples/alert_triage_agent/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools >= 64", "setuptools-scm>=8"]

[tool.setuptools_scm]
root = "../.."

[project]
name = "aiq_alert_triage_agent"
dynamic = ["version"]
dependencies = [
"aiqtoolkit[langchain]",
"langchain-core", # version determined by aiqtoolkit[langchain]
"pandas>=2.0.0",
"ansible-runner>=2.3.0",
"langgraph>=0.0.10", # version determined by aiqtoolkit[langchain]
"flask>=3.0.0"
]
requires-python = ">=3.11,<3.13"
description = "Alert Triage AIQ Toolkit example"
classifiers = ["Programming Language :: Python"]

[tool.uv.sources]
aiq = { path = "../..", editable = true }

[project.entry-points.'aiq.components']
aiq_alert_triage_agent = "aiq_alert_triage_agent.register"
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import MessagesPlaceholder
from pydantic import Field

from aiq.builder.builder import Builder
from aiq.builder.framework_enum import LLMFrameworkEnum
from aiq.builder.function_info import FunctionInfo
from aiq.cli.register_workflow import register_function
from aiq.data_models.component_ref import LLMRef
from aiq.data_models.function import FunctionBaseConfig

from . import utils
from .prompts import PipelineNodePrompts


class CategorizerToolConfig(FunctionBaseConfig, name="categorizer"):
    """Configuration for the root-cause categorizer pipeline tool."""

    # Tool description surfaced to the agent that invokes this tool.
    description: str = Field(
        default="This is a categorization tool used at the end of the pipeline.",
        description="Description of the tool."
    )
    # Reference to the LLM used to categorize the triage report.
    llm_name: LLMRef


@register_function(config_type=CategorizerToolConfig)
async def categorizer_tool(config: CategorizerToolConfig, builder: Builder):
    """Register the root-cause categorizer tool.

    Builds a LangChain chain (system categorizer prompt + message placeholder
    piped into the configured LLM) and yields a function that appends a
    "Root Cause Category" section to a markdown triage report.
    """
    # Set up LLM and chain
    llm = await builder.get_llm(config.llm_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN)
    prompt_template = ChatPromptTemplate(
        [("system", PipelineNodePrompts.CATEGORIZER_PROMPT), MessagesPlaceholder("msgs")])
    categorization_chain = prompt_template | llm

    async def _arun(report: str) -> str:
        """Categorize the root cause described in *report* and return a markdown section."""
        tool_name = "Root Cause Categorizer"
        utils.log_header(tool_name)

        result = await categorization_chain.ainvoke({"msgs": [HumanMessage(content=report)]})

        # Extract the markdown heading level from first line of report (e.g. '#' or '##')
        pound_signs = report.split('\n')[0].split(' ')[0]

        # BUGFIX: backslashes inside an f-string expression are a SyntaxError
        # before Python 3.12 (PEP 701), and this project supports 3.11
        # (requires-python >=3.11,<3.13). Do the replace outside the f-string.
        # Extra newline between category and reasoning improves readability.
        spaced_content = result.content.replace('\n', '\n\n')

        # Format the root cause category section:
        # - Add newlines before the section
        # - Use extracted heading level for consistency with the report
        report_section = f"\n\n{pound_signs} Root Cause Category\n{spaced_content}"

        # Log the result for tracking
        utils.logger.debug(result.content)
        utils.log_footer()

        return report_section

    yield FunctionInfo.from_fn(_arun, description=config.description)
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

general:
  use_uvloop: true

# Tool functions available to the alert triage workflow.
functions:
  hardware_check:
    _type: hardware_check
    llm_name: tool_reasoning_llm
  host_performance_check:
    _type: host_performance_check
    llm_name: tool_reasoning_llm
  monitoring_process_check:
    _type: monitoring_process_check
    llm_name: tool_reasoning_llm
  network_connectivity_check:
    _type: network_connectivity_check
    llm_name: tool_reasoning_llm
  telemetry_metrics_host_heartbeat_check:
    _type: telemetry_metrics_host_heartbeat_check
    llm_name: tool_reasoning_llm
  telemetry_metrics_host_performance_check:
    _type: telemetry_metrics_host_performance_check
    llm_name: tool_reasoning_llm
  # Sub-agent that wraps the two telemetry checks above.
  telemetry_metrics_analysis_agent:
    _type: telemetry_metrics_analysis_agent
    tool_names:
      - telemetry_metrics_host_heartbeat_check
      - telemetry_metrics_host_performance_check
    llm_name: telemetry_metrics_analysis_agent_llm
  maintenance_check:
    _type: maintenance_check
    llm_name: maintenance_check_llm
  categorizer:
    _type: categorizer
    llm_name: categorizer_llm

# Top-level triage agent and the tools it may call.
workflow:
  _type: alert_triage_agent
  tool_names:
    - hardware_check
    - host_performance_check
    - monitoring_process_check
    - network_connectivity_check
    - telemetry_metrics_analysis_agent
  llm_name: ata_agent_llm

# LLM endpoints (all NIM-hosted llama-3.3-70b-instruct with per-role sampling).
llms:
  ata_agent_llm:
    _type: nim
    model_name: meta/llama-3.3-70b-instruct
    temperature: 0.2
    max_tokens: 2048

  tool_reasoning_llm:
    _type: nim
    model_name: meta/llama-3.3-70b-instruct
    temperature: 0.2
    top_p: 0.7
    max_tokens: 2048

  telemetry_metrics_analysis_agent_llm:
    _type: nim
    model_name: meta/llama-3.3-70b-instruct
    temperature: 0
    max_tokens: 2048

  maintenance_check_llm:
    _type: nim
    model_name: meta/llama-3.3-70b-instruct
    temperature: 0
    max_tokens: 2048

  categorizer_llm:
    _type: nim
    model_name: meta/llama-3.3-70b-instruct
    temperature: 0
    max_tokens: 2048
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import subprocess
from pydantic import Field
from aiq.builder.builder import Builder
from aiq.builder.function_info import FunctionInfo
from aiq.cli.register_workflow import register_function
from aiq.data_models.component_ref import LLMRef
from aiq.data_models.function import FunctionBaseConfig

from . import utils
from .prompts import ToolReasoningLayerPrompts
def _get_ipmi_monitor_data(ip_address, username, password):
    """Run the ``ipmimonitoring`` CLI against a host and return its stdout.

    NOTE: This is just an example implementation of hardware status checking using IPMI.
    Users should implement their own hardware check commands specific to their environment
    and infrastructure setup. The key is to return hardware health/status information in
    a format that can be analyzed.

    Args:
        ip_address (str): The IP address of the device.
        username (str): The user credential for ipmi monitoring.
        password (str): The password credential for ipmi monitoring.

    Returns:
        str: The command's output if successful, otherwise None.
    """
    # Assemble the ipmimonitoring invocation (list form: no shell involved).
    cmd = [
        "ipmimonitoring",
        "-h", ip_address,
        "-u", username,
        "-p", password,
        "--privilege-level=USER",
    ]

    try:
        completed = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        # Non-zero exit code: log the captured stderr and signal failure with None.
        utils.logger.error(f"Error executing IPMI monitoring command. Details: {e.stderr}")
        return None

    return completed.stdout


class HardwareCheckToolConfig(FunctionBaseConfig, name="hardware_check"):
    """Configuration for the IPMI-based hardware health check tool."""

    # Tool description surfaced to the agent; documents the expected argument.
    description: str = Field(
        default="This tool checks hardware health status using IPMI monitoring to detect power state, hardware degradation, and anomalies that could explain alerts. Args: host_id: str",
        description="Description of the tool for the agent."
    )
    # LLM used to reason over / summarize the raw IPMI output.
    llm_name: LLMRef

@register_function(config_type=HardwareCheckToolConfig)
async def hardware_check_tool(config: HardwareCheckToolConfig, builder: Builder):
    """Register the hardware health check tool.

    Yields a function that gathers IPMI monitoring data — live via
    ``ipmimonitoring`` or, in test mode, from the CSV test dataset — and asks
    the configured LLM to summarize the hardware health findings.
    """

    async def _arun(host_id: str) -> str:
        """Run the hardware check for *host_id* and return the LLM's conclusion."""
        is_test_mode = utils.is_test_mode()
        utils.log_header("Hardware Status Checker")

        try:
            if not is_test_mode:
                # NOTE(review): placeholder credentials — replace with real IPMI
                # connection details (or load them from configuration/environment).
                ip = "ipmi_ip"  # Replace with your actual IPMI IP address
                user = "ipmi_user"  # Replace with your actual username
                pwd = "ipmi_password"  # Replace with your actual password
                monitoring_data = _get_ipmi_monitor_data(ip, user, pwd)
            else:
                # In test mode, load test data from CSV file
                df = utils.load_test_data()

                # Get IPMI data from test data, falling back to static data if needed
                monitoring_data = utils.load_column_or_static(
                    df=df,
                    host_id=host_id,
                    column="hardware_check_tool:ipmi_output",
                )

            # Guard clause: no IPMI data could be retrieved.
            if monitoring_data is None:
                utils.logger.debug("No hardware data available")
                return "Hardware check failed: Unable to retrieve hardware monitoring data. This could indicate connectivity issues with the IPMI interface, invalid credentials, or that the IPMI service is not responding."

            # Additional LLM reasoning layer on playbook output to provide a summary of the results
            utils.log_header("LLM Reasoning", dash_length=50)

            prompt = ToolReasoningLayerPrompts.HARDWARE_CHECK.format(input_data=monitoring_data)

            # Get analysis from LLM
            conclusion = await utils.llm_ainvoke(config, builder, prompt)

            utils.logger.debug(conclusion)
            utils.log_footer()

            return conclusion

        except Exception as e:
            # Lazy %-style logging avoids formatting when the level is disabled;
            # bare `raise` preserves the original traceback exactly.
            utils.logger.error("Error during hardware check: %s", e)
            raise

    yield FunctionInfo.from_fn(
        _arun,
        description=config.description,
    )
Loading
Loading