Dheeraj Oruganty e346e83bf1
fix(02-use-cases): SRE-Agent Deployment (#179)
* Add missing credential_provider_name parameter to config.yaml.example

* Fix get_config function to properly parse YAML values with inline comments

* Enhanced get_config to prevent copy-paste whitespace errors in AWS identifiers

* Improve LLM provider configuration and error handling with bedrock as default

* Add OpenAPI templating system and fix hardcoded regions

* Add backend template build to Readme

* delete old yaml files

* Fix Cognito setup with automation script and missing domain creation steps

* docs: Add EC2 instance port configuration documentation

- Document required inbound ports (443, 8011-8014)
- Include SSL/TLS security requirements
- Add AWS security group best practices
- Provide port usage summary table

* docs: Add hyperlinks to prerequisites in README

- Link EC2 port configuration documentation
- Link IAM role authentication setup
- Improve navigation to detailed setup instructions

* docs: Add BACKEND_API_KEY to configuration documentation

- Document gateway environment variables section
- Add BACKEND_API_KEY requirement for credential provider
- Include example .env file format for gateway directory
- Explain usage in create_gateway.sh script

* docs: Add BACKEND_API_KEY to deployment guide environment variables

- Include BACKEND_API_KEY in environment variables reference table
- Mark as required for gateway setup
- Provide quick reference alongside other required variables

* docs: Add BedrockAgentCoreFullAccess policy and trust policy documentation

- Document AWS managed policy BedrockAgentCoreFullAccess
- Add trust policy requirements for bedrock-agentcore.amazonaws.com
- Reorganize IAM permissions for better clarity
- Remove duplicate trust policy section
- Add IAM role requirement to deployment prerequisites

* docs: Document role_name field in gateway config example

- Explain that role_name is used to create and manage the gateway
- Specify BedrockAgentCoreFullAccess policy requirement
- Note trust policy requirement for bedrock-agentcore.amazonaws.com
- Improve clarity for gateway configuration setup

* docs: Add AWS IP address ranges for production security enhancement

- Document AWS IP ranges JSON download for restricting access
- Reference official AWS documentation for IP address ranges
- Provide security alternatives to 0.0.0.0/0 for production
- Include examples of restricted security group configurations
- Enable egress filtering and region-specific access control

* style: Format Python code with black

- Reformat 14 Python files for consistent code style
- Apply PEP 8 formatting standards
- Improve code readability and maintainability

* docs: Update SRE agent prerequisites and setup documentation

- Convert prerequisites section to markdown table format
- Add SSL certificate provider examples (no-ip.com, letsencrypt.org)
- Add Identity Provider (IDP) requirement with setup_cognito.sh reference
- Clarify that all prerequisites must be completed before setup
- Add reference to domain name and cert paths needed for BACKEND_DOMAIN
- Remove Managing OpenAPI Specifications section (covered in use-case setup)
- Add Deployment Guide link to Development to Production section

Addresses issues #171 and #174

* fix: Replace 'AWS Bedrock' with 'Amazon Bedrock' in SRE agent files

- Updated error messages in llm_utils.py
- Updated comments in both .env.example files
- Ensures consistent naming convention across SRE agent codebase

---------

Co-authored-by: dheerajoruganty <dheo@amazon.com>
Co-authored-by: Amit Arora <aroraai@amazon.com>
2025-08-01 13:24:58 -04:00

292 lines
9.5 KiB
Python

#!/usr/bin/env python3
import asyncio
import logging
import os
from datetime import datetime, timezone
from typing import Any, Dict
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from langchain_core.messages import HumanMessage
from langchain_core.tools import BaseTool
from .multi_agent_langgraph import create_multi_agent_system
from .agent_state import AgentState
from .constants import SREConstants
# Import logging config
from .logging_config import configure_logging
# Configure logging based on DEBUG environment variable
# This ensures debug mode works even when not run via __main__
if not logging.getLogger().handlers:
# Check if DEBUG is already set in environment
debug_from_env = os.getenv("DEBUG", "false").lower() in ("true", "1", "yes")
configure_logging(debug_from_env)
# Disable uvicorn access logs for /ping endpoint
logging.getLogger("uvicorn.access").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
# Simple FastAPI app
app = FastAPI(title="SRE Agent Runtime", version="1.0.0")
# Simple request/response models
class InvocationRequest(BaseModel):
input: Dict[str, Any]
class InvocationResponse(BaseModel):
output: Dict[str, Any]
# Global variables for agent state
agent_graph = None
tools: list[BaseTool] = []
async def initialize_agent():
"""Initialize the SRE agent system using the same method as CLI."""
global agent_graph, tools
if agent_graph is not None:
return # Already initialized
try:
logger.info("Initializing SRE Agent system...")
# Get provider from environment variable with bedrock as default
provider = os.getenv("LLM_PROVIDER", "bedrock").lower()
# Validate provider
if provider not in ["anthropic", "bedrock"]:
logger.warning(f"Invalid provider '{provider}', defaulting to 'bedrock'")
provider = "bedrock"
logger.info(f"Environment LLM_PROVIDER: {os.getenv('LLM_PROVIDER', 'NOT_SET')}")
logger.info(f"Using LLM provider: {provider}")
logger.info(f"Calling create_multi_agent_system with provider: {provider}")
# Create multi-agent system using the same function as CLI
agent_graph, tools = await create_multi_agent_system(provider)
logger.info(
f"SRE Agent system initialized successfully with {len(tools)} tools"
)
except Exception as e:
from .llm_utils import LLMAuthenticationError, LLMAccessError, LLMProviderError
if isinstance(e, (LLMAuthenticationError, LLMAccessError, LLMProviderError)):
logger.error(f"LLM Provider Error: {e}")
print(f"\n{type(e).__name__}:")
print(str(e))
print(f"\n💡 Set LLM_PROVIDER environment variable to switch providers:")
other_provider = "anthropic" if provider == "bedrock" else "bedrock"
print(f" export LLM_PROVIDER={other_provider}")
else:
logger.error(f"Failed to initialize SRE Agent system: {e}")
raise
@app.on_event("startup")
async def startup_event():
"""Initialize agent on startup."""
await initialize_agent()
@app.post("/invocations", response_model=InvocationResponse)
async def invoke_agent(request: InvocationRequest):
"""Main agent invocation endpoint."""
global agent_graph, tools
logger.info("Received invocation request")
try:
# Ensure agent is initialized
await initialize_agent()
# Extract user prompt
user_prompt = request.input.get("prompt", "")
if not user_prompt:
raise HTTPException(
status_code=400,
detail="No prompt found in input. Please provide a 'prompt' key in the input.",
)
logger.info(f"Processing query: {user_prompt}")
# Create initial state exactly like the CLI does
initial_state: AgentState = {
"messages": [HumanMessage(content=user_prompt)],
"next": "supervisor",
"agent_results": {},
"current_query": user_prompt,
"metadata": {},
"requires_collaboration": False,
"agents_invoked": [],
"final_response": None,
"auto_approve_plan": True, # Always auto-approve plans in runtime mode
}
# Process through the agent graph exactly like the CLI
final_response = ""
logger.info("Starting agent graph execution")
async for event in agent_graph.astream(initial_state):
for node_name, node_output in event.items():
logger.info(f"Processing node: {node_name}")
# Log key events from each node
if node_name == "supervisor":
next_agent = node_output.get("next", "")
metadata = node_output.get("metadata", {})
logger.info(f"Supervisor routing to: {next_agent}")
if metadata.get("routing_reasoning"):
logger.info(
f"Routing reasoning: {metadata['routing_reasoning']}"
)
elif node_name in [
"kubernetes_agent",
"logs_agent",
"metrics_agent",
"runbooks_agent",
]:
agent_results = node_output.get("agent_results", {})
logger.info(f"{node_name} completed with results")
# Capture final response from aggregate node
elif node_name == "aggregate":
final_response = node_output.get("final_response", "")
logger.info("Aggregate node completed, final response captured")
if not final_response:
logger.warning("No final response received from agent graph")
final_response = (
"I encountered an issue processing your request. Please try again."
)
else:
logger.info(f"Final response length: {len(final_response)} characters")
# Simple response format
response_data = {
"message": final_response,
"timestamp": datetime.now(timezone.utc).isoformat(),
"model": SREConstants.app.agent_model_name,
}
logger.info("Successfully processed agent request")
logger.info("Returning invocation response")
return InvocationResponse(output=response_data)
except HTTPException:
raise
except Exception as e:
logger.error(f"Agent processing failed: {e}")
logger.exception("Full exception details:")
raise HTTPException(
status_code=500, detail=f"Agent processing failed: {str(e)}"
)
@app.get("/ping")
async def ping():
"""Health check endpoint."""
return {"status": "healthy"}
async def invoke_sre_agent_async(prompt: str, provider: str = "anthropic") -> str:
"""
Programmatic interface to invoke SRE agent.
Args:
prompt: The user prompt/query
provider: LLM provider ("anthropic" or "bedrock")
Returns:
The agent's response as a string
"""
try:
# Create the multi-agent system
graph, tools = await create_multi_agent_system(provider=provider)
# Create initial state
initial_state: AgentState = {
"messages": [HumanMessage(content=prompt)],
"next": "supervisor",
"agent_results": {},
"current_query": prompt,
"metadata": {},
"requires_collaboration": False,
"agents_invoked": [],
"final_response": None,
}
# Execute and get final response
final_response = ""
async for event in graph.astream(initial_state):
for node_name, node_output in event.items():
if node_name == "aggregate":
final_response = node_output.get("final_response", "")
return final_response or "I encountered an issue processing your request."
except Exception as e:
logger.error(f"Agent invocation failed: {e}")
raise
def invoke_sre_agent(prompt: str, provider: str = "anthropic") -> str:
"""
Synchronous wrapper for invoke_sre_agent_async.
Args:
prompt: The user prompt/query
provider: LLM provider ("anthropic" or "bedrock")
Returns:
The agent's response as a string
"""
return asyncio.run(invoke_sre_agent_async(prompt, provider))
if __name__ == "__main__":
import argparse
import uvicorn
parser = argparse.ArgumentParser(description="SRE Agent Runtime")
parser.add_argument(
"--provider",
choices=["anthropic", "bedrock"],
default=os.getenv("LLM_PROVIDER", "bedrock"),
help="LLM provider to use (default: bedrock)",
)
parser.add_argument("--host", default="0.0.0.0", help="Host to bind to")
parser.add_argument("--port", type=int, default=8080, help="Port to bind to")
parser.add_argument(
"--debug",
action="store_true",
help="Enable debug logging and trace output",
)
args = parser.parse_args()
# Configure logging based on debug flag
from .logging_config import configure_logging
debug_enabled = configure_logging(args.debug)
# Set environment variables
os.environ["LLM_PROVIDER"] = args.provider
os.environ["DEBUG"] = "true" if debug_enabled else "false"
logger.info(f"Starting SRE Agent Runtime with provider: {args.provider}")
if debug_enabled:
logger.info("Debug logging enabled")
uvicorn.run(app, host=args.host, port=args.port)