2025-07-16 14:07:30 -04:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
import logging
|
|
|
|
from typing import Any, Dict, List, Literal
|
|
|
|
|
|
|
|
from langchain_core.messages import HumanMessage
|
|
|
|
from langchain_core.tools import BaseTool
|
|
|
|
from langgraph.graph import END, StateGraph
|
|
|
|
|
|
|
|
from .agent_nodes import (
|
|
|
|
create_kubernetes_agent,
|
|
|
|
create_logs_agent,
|
|
|
|
create_metrics_agent,
|
|
|
|
create_runbooks_agent,
|
|
|
|
)
|
|
|
|
from .agent_state import AgentState
|
|
|
|
from .supervisor import SupervisorAgent
|
|
|
|
|
|
|
|
# Configure logging once at import time: INFO level and a compact,
# comma-separated record layout (timestamp, pid, {file:line}, level, message).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s,p%(process)s,{%(filename)s:%(lineno)d},%(levelname)s,%(message)s",
)

# Module-level logger shared by the helpers defined in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
def _should_continue(state: AgentState) -> Literal["supervisor", "FINISH"]:
    """Decide whether control goes back to the supervisor or the run ends.

    Args:
        state: Current graph state. Read through ``.get`` for the keys
            ``next``, ``agents_invoked`` and ``requires_collaboration``.

    Returns:
        ``"FINISH"`` when ``next`` is missing or equal to ``"FINISH"``, or
        when the agent named by ``next`` already appears in
        ``agents_invoked`` and ``requires_collaboration`` is falsy;
        ``"supervisor"`` otherwise.
    """
    next_agent = state.get("next", "FINISH")

    if next_agent == "FINISH":
        return "FINISH"

    # A repeat invocation is only allowed when collaboration was requested;
    # otherwise stop here so the graph cannot loop on the same agent.
    agents_invoked = state.get("agents_invoked", [])
    if next_agent in agents_invoked and not state.get("requires_collaboration", False):
        logger.warning(f"Agent {next_agent} already invoked, finishing to avoid loop")
        return "FINISH"

    return "supervisor"
|
|
|
|
|
|
|
|
|
|
|
|
def _route_supervisor(state: AgentState) -> str:
    """Translate the supervisor's ``next`` decision into a graph node name.

    Args:
        state: Current graph state; only the ``next`` key is read.

    Returns:
        The node name for the chosen agent (``"kubernetes_agent"``,
        ``"logs_agent"``, ``"metrics_agent"`` or ``"runbooks_agent"``), or
        ``"aggregate"`` when ``next`` is missing, equal to ``"FINISH"``, or
        not one of the known agent keys.
    """
    next_agent = state.get("next", "FINISH")

    if next_agent == "FINISH":
        return "aggregate"

    # Short agent keys as stored in the state -> node names registered on the graph.
    agent_map = {
        "kubernetes": "kubernetes_agent",
        "logs": "logs_agent",
        "metrics": "metrics_agent",
        "runbooks": "runbooks_agent",
    }

    # Unrecognised values fall through to aggregation instead of raising.
    return agent_map.get(next_agent, "aggregate")
|
|
|
|
|
|
|
|
|
|
|
|
async def _prepare_initial_state(state: AgentState) -> Dict[str, Any]:
    """Build the starting state values for a new query.

    Args:
        state: Incoming graph state; only the ``messages`` key is read.

    Returns:
        A dict with ``current_query`` set to the content of the most recent
        ``HumanMessage`` in ``messages`` (empty string if there is none) and
        with ``agent_results``, ``agents_invoked``, ``requires_collaboration``
        and ``metadata`` reset to empty/false values.
    """
    messages = state.get("messages", [])

    # Walk the history newest-first and stop at the first human message.
    current_query = ""
    for msg in reversed(messages):
        if isinstance(msg, HumanMessage):
            current_query = msg.content
            break

    return {
        "current_query": current_query,
        "agent_results": {},
        "agents_invoked": [],
        "requires_collaboration": False,
        "metadata": {},
    }
|
|
|
|
|
|
|
|
|
|
|
|
def build_multi_agent_graph(
    tools: List[BaseTool], llm_provider: str = "bedrock", **llm_kwargs
) -> StateGraph:
    """Build the multi-agent collaboration graph.

    The graph starts at ``prepare``, moves to ``supervisor``, and from there
    is routed by ``_route_supervisor`` either to one of the four agent nodes
    (each of which returns to ``supervisor``) or to ``aggregate``, which
    leads to ``END``.

    Args:
        tools: List of all available tools; the same list is handed to every
            agent factory.
        llm_provider: LLM provider to use.
        **llm_kwargs: Additional arguments for LLM, forwarded to the
            supervisor and to each agent factory.

    Returns:
        Compiled StateGraph for multi-agent collaboration.

        NOTE(review): the value returned is the result of
        ``workflow.compile()``, not the ``StateGraph`` builder named in the
        annotation -- confirm and tighten the annotation.
    """
    logger.info("Building multi-agent collaboration graph")

    # Create the state graph
    workflow = StateGraph(AgentState)

    # Create supervisor
    supervisor = SupervisorAgent(llm_provider=llm_provider, **llm_kwargs)

    # Create agent nodes, keyed by node name. Insertion order fixes both the
    # construction order and the registration order below.
    # NOTE(review): every factory gets the full tool list; any per-agent tool
    # filtering presumably happens inside the factories -- confirm.
    agent_nodes = {
        "kubernetes_agent": create_kubernetes_agent(
            tools, llm_provider=llm_provider, **llm_kwargs
        ),
        "logs_agent": create_logs_agent(
            tools, llm_provider=llm_provider, **llm_kwargs
        ),
        "metrics_agent": create_metrics_agent(
            tools, llm_provider=llm_provider, **llm_kwargs
        ),
        "runbooks_agent": create_runbooks_agent(
            tools, llm_provider=llm_provider, **llm_kwargs
        ),
    }

    # Add nodes to the graph
    workflow.add_node("prepare", _prepare_initial_state)
    workflow.add_node("supervisor", supervisor.route)
    for node_name, node in agent_nodes.items():
        workflow.add_node(node_name, node)
    workflow.add_node("aggregate", supervisor.aggregate_responses)

    # Set entry point
    workflow.set_entry_point("prepare")

    # Add edges from prepare to supervisor
    workflow.add_edge("prepare", "supervisor")

    # Add conditional edges from supervisor: every value _route_supervisor can
    # return maps onto the node of the same name.
    route_targets = {node_name: node_name for node_name in agent_nodes}
    route_targets["aggregate"] = "aggregate"
    workflow.add_conditional_edges("supervisor", _route_supervisor, route_targets)

    # Add edges from agents back to supervisor
    for node_name in agent_nodes:
        workflow.add_edge(node_name, "supervisor")

    # Add edge from aggregate to END
    workflow.add_edge("aggregate", END)

    # Compile the graph
    compiled_graph = workflow.compile()

    logger.info("Multi-agent collaboration graph built successfully")
    return compiled_graph
|