Amit Arora ff5fdffd42
fix(02-use-cases): Add multi-region support for SRE-Agent (#246)
* Add multi-region support for SRE-Agent

- Add AWS region configuration parameter to agent_config.yaml
- Update gateway main.py to validate region matches endpoint URL
- Modify SRE agent to read region from config and pass through function chain
- Update memory client and LLM creation to use configurable region
- Fixes hardcoded us-east-1 region dependencies

Closes #245

* Move architecture file to docs/ and improve setup instructions

- Move sre_agent_architecture.md to docs/ folder for better organization
- Update graph export code to generate architecture file in docs/ folder
- Add automatic docs directory creation if it doesn't exist
- Improve README setup instructions:
  - Fix .env.example copy path to use sre_agent folder
  - Add note that Amazon Bedrock users don't need to modify .env
  - Add START_API_BACKEND variable to conditionally start backend servers
  - Useful for workshop environments where backends are already running

* Improve gateway configuration documentation and setup instructions

- Update config.yaml.example to use REGION placeholder instead of hardcoded us-east-1
- Add gateway configuration step to README setup instructions
- Document .cognito_config file in auth.md automated setup section
- Remove duplicate credential_provider_name from config.yaml.example
- Update configuration.md to include .cognito_config in files overview
- Add clear instructions to copy and edit gateway/config.yaml before creating gateway

* Improve IAM role guidance and region handling

- Add clear guidance about IAM role options in gateway/config.yaml.example
- Explain that testing can use current EC2/notebook role
- Recommend dedicated role for production deployments
- Add aws sts get-caller-identity command to help users find their role
- Update deployment scripts to use AWS_REGION env var as fallback
- Scripts now follow: CLI arg -> AWS_REGION env var -> us-east-1 default

* Remove unnecessary individual Cognito ID files

- Remove creation of .cognito_user_pool_id file
- Remove creation of .cognito_client_id file
- Keep only .cognito_config as the single source of truth
- Simplifies configuration management

* Implement region fallback logic for SRE Agent

- Add region fallback chain: agent_config.yaml -> AWS_REGION env -> us-east-1
- Modify agent_config.yaml to comment out the region parameter so the fallback applies
- Update multi_agent_langgraph.py with comprehensive fallback implementation
- Add logging to show which region source is being used
- Ensure flexible region configuration without breaking existing deployments
- Maintain backward compatibility while adding multi-region support
2025-08-13 08:32:37 -04:00

212 lines
6.9 KiB
Python

#!/usr/bin/env python3
import logging
from typing import Any, Dict, List, Literal
from langchain_core.messages import HumanMessage
from langchain_core.tools import BaseTool
from langgraph.graph import END, StateGraph
from .agent_nodes import (
create_kubernetes_agent,
create_logs_agent,
create_metrics_agent,
create_runbooks_agent,
)
from .agent_state import AgentState
from .constants import SREConstants
from .supervisor import SupervisorAgent
# Set up logging once at import time: INFO level, with comma-separated records
# carrying timestamp, process id, {file:line}, level name and message.
logging.basicConfig(
    format="%(asctime)s,p%(process)s,{%(filename)s:%(lineno)d},%(levelname)s,%(message)s",
    level=logging.INFO,
)

# Logger named after this module; used by the functions in this file.
logger = logging.getLogger(__name__)
def _should_continue(state: AgentState) -> Literal["supervisor", "FINISH"]:
    """Decide whether control returns to the supervisor or the run ends.

    Args:
        state: Current graph state; the keys "next", "agents_invoked" and
            "requires_collaboration" are read, each with a default.

    Returns:
        "FINISH" when "next" is absent or equal to "FINISH", or when the
        requested agent is already listed in "agents_invoked" and
        "requires_collaboration" is not truthy. "supervisor" otherwise.
    """
    requested = state.get("next", "FINISH")
    if requested == "FINISH":
        return "FINISH"

    previously_invoked = state.get("agents_invoked", [])
    is_repeat = requested in previously_invoked

    # A repeat visit is only permitted when collaboration has been requested.
    if not is_repeat or state.get("requires_collaboration", False):
        return "supervisor"

    logger.warning(f"Agent {requested} already invoked, finishing to avoid loop")
    return "FINISH"
def _route_supervisor(state: AgentState) -> str:
    """Translate the supervisor's "next" decision into a graph node name.

    Args:
        state: Current graph state; only the "next" key is read.

    Returns:
        The matching "<name>_agent" node for a recognised agent, given either
        as a short name (e.g. "logs") or as a full node name (e.g.
        "logs_agent"). "aggregate" when "next" is missing, equal to "FINISH",
        or not a recognised agent.
    """
    target = state.get("next", "FINISH")
    if target == "FINISH":
        return "aggregate"

    # Accept both the old short names and the new full node names.
    routes = {}
    for short_name in ("kubernetes", "logs", "metrics", "runbooks"):
        node_name = f"{short_name}_agent"
        routes[short_name] = node_name
        routes[node_name] = node_name

    return routes.get(target, "aggregate")
async def _prepare_initial_state(state: AgentState) -> Dict[str, Any]:
    """Build the state update that seeds a run with the user's query.

    Args:
        state: Current graph state; only the "messages" key is read.

    Returns:
        A dict with "current_query" set to the content of the most recent
        HumanMessage in "messages" (empty string if there is none), plus
        empty/false values for "agent_results", "agents_invoked",
        "requires_collaboration" and "metadata".
    """
    history = state.get("messages", [])

    # Walk the history newest-first and take the first human message found.
    latest_human_content = next(
        (entry.content for entry in reversed(history) if isinstance(entry, HumanMessage)),
        "",
    )

    initial_update = {
        "current_query": latest_human_content,
        "agent_results": {},
        "agents_invoked": [],
        "requires_collaboration": False,
        "metadata": {},
    }
    return initial_update
def _export_graph_as_mermaid(compiled_graph: Any, graph_output_path: str) -> None:
    """Write the compiled graph's Mermaid diagram to a Markdown file.

    This is best effort: any failure is logged with its traceback and printed,
    but never raised, so a failed export cannot prevent the graph from being
    returned to the caller.

    Args:
        compiled_graph: Object returned by ``workflow.compile()``.
        graph_output_path: Destination file; missing parent directories are
            created.
    """
    from pathlib import Path

    try:
        output_path = Path(graph_output_path)
        # Create the target directory (e.g. ./docs) if it doesn't exist.
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Get the Mermaid representation of the graph.
        mermaid_diagram = compiled_graph.get_graph().draw_mermaid()

        # Explicit UTF-8 so the written file does not depend on the
        # platform's locale encoding.
        with output_path.open("w", encoding="utf-8") as f:
            f.write("# SRE Agent Architecture\n\n")
            f.write("```mermaid\n")
            f.write(mermaid_diagram)
            f.write("\n```\n")

        logger.info(f"Graph architecture (Mermaid) exported to: {graph_output_path}")
        print(f"✅ Graph architecture (Mermaid diagram) exported to: {graph_output_path}")
    except Exception as e:
        # logger.exception keeps the traceback, which a plain error message loses.
        logger.exception(f"Failed to export graph: {e}")
        print(f"❌ Failed to export graph: {e}")


def build_multi_agent_graph(
    tools: List[BaseTool],
    llm_provider: str = "bedrock",
    force_delete_memory: bool = False,
    export_graph: bool = False,
    graph_output_path: str = "./docs/sre_agent_architecture.md",
    **llm_kwargs,
) -> StateGraph:
    """Build the multi-agent collaboration graph.

    The flow is: "prepare" -> "supervisor" -> one of the four agent nodes
    (each of which returns to "supervisor") or "aggregate" -> END.

    Args:
        tools: List of all available tools; passed to every agent factory.
        llm_provider: LLM provider to use; forwarded to the supervisor and to
            every agent factory.
        force_delete_memory: Whether to force delete existing memory;
            forwarded to SupervisorAgent.
        export_graph: Whether to export the graph as a Mermaid diagram.
        graph_output_path: Path to save the exported Mermaid diagram
            (default: ./docs/sre_agent_architecture.md).
        **llm_kwargs: Additional arguments for the LLM; forwarded to the
            supervisor and to every agent factory.

    Returns:
        The compiled graph produced by ``workflow.compile()``.
    """
    logger.info("Building multi-agent collaboration graph")

    # Create the state graph
    workflow = StateGraph(AgentState)

    # Create supervisor
    supervisor = SupervisorAgent(
        llm_provider=llm_provider, force_delete_memory=force_delete_memory, **llm_kwargs
    )

    # Single ordered table of agents: the key is both the metadata key in
    # SREConstants and the prefix of the "<key>_agent" node name.
    agent_factories = {
        "kubernetes": create_kubernetes_agent,
        "logs": create_logs_agent,
        "metrics": create_metrics_agent,
        "runbooks": create_runbooks_agent,
    }

    # Create agent nodes with filtered tools and metadata from constants
    agent_nodes = {
        f"{agent_key}_agent": factory(
            tools,
            agent_metadata=SREConstants.agents.agents[agent_key],
            llm_provider=llm_provider,
            **llm_kwargs,
        )
        for agent_key, factory in agent_factories.items()
    }

    # Add nodes to the graph
    workflow.add_node("prepare", _prepare_initial_state)
    workflow.add_node("supervisor", supervisor.route)
    for node_name, node in agent_nodes.items():
        workflow.add_node(node_name, node)
    workflow.add_node("aggregate", supervisor.aggregate_responses)

    # Set entry point and connect it to the supervisor
    workflow.set_entry_point("prepare")
    workflow.add_edge("prepare", "supervisor")

    # Conditional edges from supervisor: each value _route_supervisor can
    # return maps to the node of the same name.
    supervisor_routes = {node_name: node_name for node_name in agent_nodes}
    supervisor_routes["aggregate"] = "aggregate"
    workflow.add_conditional_edges("supervisor", _route_supervisor, supervisor_routes)

    # Add edges from agents back to supervisor
    for node_name in agent_nodes:
        workflow.add_edge(node_name, "supervisor")

    # Add edge from aggregate to END
    workflow.add_edge("aggregate", END)

    # Compile the graph
    compiled_graph = workflow.compile()

    # Export graph visualization if requested (best effort, never raises)
    if export_graph:
        _export_graph_as_mermaid(compiled_graph, graph_output_path)

    logger.info("Multi-agent collaboration graph built successfully")
    return compiled_graph