Amit Arora ff5fdffd42
fix(02-use-cases): Add multi-region support for SRE-Agent (#246)
* Add multi-region support for SRE-Agent

- Add AWS region configuration parameter to agent_config.yaml
- Update gateway main.py to validate region matches endpoint URL
- Modify SRE agent to read region from config and pass through function chain
- Update memory client and LLM creation to use configurable region
- Fixes hardcoded us-east-1 region dependencies

Closes #245

* Move architecture file to docs/ and improve setup instructions

- Move sre_agent_architecture.md to docs/ folder for better organization
- Update graph export code to generate architecture file in docs/ folder
- Add automatic docs directory creation if it doesn't exist
- Improve README setup instructions:
  - Fix .env.example copy path to use sre_agent folder
  - Add note that Amazon Bedrock users don't need to modify .env
  - Add START_API_BACKEND variable to conditionally start backend servers
  - Useful for workshop environments where backends are already running

* Improve gateway configuration documentation and setup instructions

- Update config.yaml.example to use REGION placeholder instead of hardcoded us-east-1
- Add gateway configuration step to README setup instructions
- Document .cognito_config file in auth.md automated setup section
- Remove duplicate credential_provider_name from config.yaml.example
- Update configuration.md to include .cognito_config in files overview
- Add clear instructions to copy and edit gateway/config.yaml before creating gateway

* Improve IAM role guidance and region handling

- Add clear guidance about IAM role options in gateway/config.yaml.example
- Explain that testing can use current EC2/notebook role
- Recommend dedicated role for production deployments
- Add aws sts get-caller-identity command to help users find their role
- Update deployment scripts to use AWS_REGION env var as fallback
- Scripts now follow: CLI arg -> AWS_REGION env var -> us-east-1 default

* Remove unnecessary individual Cognito ID files

- Remove creation of .cognito_user_pool_id file
- Remove creation of .cognito_client_id file
- Keep only .cognito_config as the single source of truth
- Simplifies configuration management

* Implement region fallback logic for SRE Agent

- Added region fallback chain: agent_config.yaml -> AWS_REGION env -> us-east-1
- Modified agent_config.yaml to comment out region parameter to enable fallback
- Updated multi_agent_langgraph.py with comprehensive fallback implementation
- Added logging to show which region source is being used
- Ensures flexible region configuration without breaking existing deployments
- Maintains backward compatibility while adding multi-region support
2025-08-13 08:32:37 -04:00

307 lines
11 KiB
Python

#!/usr/bin/env python3
import argparse
import json
import logging
import os
import time
from pathlib import Path
from typing import Any, Optional

import boto3
from botocore.exceptions import ClientError
from dotenv import load_dotenv
# Configuration constants
# Fixed pause between deleting a runtime and trying to create it again under
# the same name; used only on the --force-recreate path.
DELETION_WAIT_TIME = 150  # seconds to wait after runtime deletion before recreating

# Configure the root logger once at import time; every logging.* call in this
# script goes through it.
logging.basicConfig(
    level=logging.INFO,
    # Record layout: timestamp, process id, {source file:line}, level, message
    format="%(asctime)s,p%(process)s,{%(filename)s:%(lineno)d},%(levelname)s,%(message)s",
)
def _write_agent_arn_to_file(agent_arn: str, output_dir: str = None) -> None:
    """Persist the agent runtime ARN to a ``.agent_arn`` file.

    The file is written inside ``output_dir`` when one is given, otherwise in
    the directory that contains this script. Any failure while writing is
    logged and swallowed, so a write problem never aborts the caller.
    """
    target_dir = Path(__file__).parent if output_dir is None else Path(output_dir)
    arn_path = target_dir / ".agent_arn"

    try:
        with arn_path.open("w") as handle:
            handle.write(agent_arn)
        logging.info(f"💾 Agent Runtime ARN saved to {arn_path}")
    except Exception as exc:
        # Best effort only: the ARN has already been logged by the caller.
        logging.error(f"Failed to write agent ARN to file: {exc}")
def _get_agent_runtime_id_by_name(client: Any, runtime_name: str) -> Optional[str]:
    """Look up an agent runtime's ID from its name.

    Follows ``nextToken`` through every page returned by
    ``list_agent_runtimes`` so that a runtime is still found when the account
    holds more runtimes than fit in a single response.

    Args:
        client: boto3 ``bedrock-agentcore-control`` client (any object that
            exposes ``list_agent_runtimes``).
        runtime_name: Exact ``agentRuntimeName`` to search for.

    Returns:
        The matching ``agentRuntimeId``, or ``None`` when no runtime has that
        name or when the listing call fails (the failure is logged).
    """
    # The first request carries no arguments; later ones add the page token.
    request_kwargs = {}
    try:
        while True:
            response = client.list_agent_runtimes(**request_kwargs)
            for runtime in response.get("agentRuntimes", []):
                if runtime["agentRuntimeName"] == runtime_name:
                    return runtime["agentRuntimeId"]

            next_token = response.get("nextToken")
            if not next_token:
                return None
            request_kwargs["nextToken"] = next_token
    except ClientError as e:
        logging.error(f"Failed to get agent runtime ID: {e}")
        return None
def _delete_agent_runtime(client: boto3.client, runtime_id: str) -> bool:
    """Request deletion of the agent runtime identified by ``runtime_id``.

    Returns ``True`` when the delete call is accepted and ``False`` when it
    raises a ``ClientError``; the error is logged rather than propagated.
    """
    logging.info(f"Deleting agent runtime with ID: {runtime_id}")
    try:
        client.delete_agent_runtime(agentRuntimeId=runtime_id)
    except ClientError as err:
        logging.error(f"Failed to delete agent runtime: {err}")
        return False

    logging.info("Agent runtime deleted successfully")
    return True
def _list_existing_agent_runtimes(client: Any) -> None:
    """Log every existing agent runtime as pretty-printed JSON.

    Follows ``nextToken`` through all pages of ``list_agent_runtimes`` so the
    listing is complete rather than limited to the first response.

    Args:
        client: boto3 ``bedrock-agentcore-control`` client (any object that
            exposes ``list_agent_runtimes``).

    A ``ClientError`` from the listing call is logged and swallowed; nothing
    is returned in any case.
    """
    runtimes = []
    # The first request carries no arguments; later ones add the page token.
    request_kwargs = {}
    try:
        while True:
            response = client.list_agent_runtimes(**request_kwargs)
            runtimes.extend(response.get("agentRuntimes", []))

            next_token = response.get("nextToken")
            if not next_token:
                break
            request_kwargs["nextToken"] = next_token
    except ClientError as e:
        logging.error(f"Failed to list agent runtimes: {e}")
        return

    if not runtimes:
        logging.info("No existing agent runtimes found.")
        return

    logging.info("Existing agent runtimes:")
    for runtime in runtimes:
        # default=str keeps non-JSON values (e.g. datetimes) printable.
        logging.info(json.dumps(runtime, indent=2, default=str))
def _mask_secret(value: str) -> str:
    """Mask a secret for logging, exposing at most its last 8 characters."""
    visible_tail = value[-8:] if len(value) > 8 else "***"
    return f"{'*' * 20}...{visible_tail}"


def _request_agent_runtime(
    client: Any,
    runtime_name: str,
    container_uri: str,
    role_arn: str,
    env_vars: dict,
) -> dict:
    """Issue ``create_agent_runtime`` with this script's fixed settings."""
    return client.create_agent_runtime(
        agentRuntimeName=runtime_name,
        agentRuntimeArtifact={
            "containerConfiguration": {"containerUri": container_uri}
        },
        networkConfiguration={"networkMode": "PUBLIC"},
        roleArn=role_arn,
        environmentVariables=env_vars,
    )


def _record_agent_runtime(response: dict, action: str) -> None:
    """Log the ARN and status of a new runtime and save the ARN to disk."""
    logging.info(f"Agent Runtime {action} successfully!")
    logging.info(f"Agent Runtime ARN: {response['agentRuntimeArn']}")
    logging.info(f"Status: {response['status']}")
    _write_agent_arn_to_file(response["agentRuntimeArn"])


def _create_agent_runtime(
    client: Any,
    runtime_name: str,
    container_uri: str,
    role_arn: str,
    anthropic_api_key: Optional[str],
    gateway_access_token: str,
    llm_provider: str = "bedrock",
    force_recreate: bool = False,
) -> None:
    """Create an agent runtime with error handling for conflicts.

    On success the runtime's ARN and status are logged and the ARN is saved
    via ``_write_agent_arn_to_file``. When the name is already taken, the
    existing runtimes are listed and, if ``force_recreate`` is set, the
    existing runtime is deleted, the script sleeps ``DELETION_WAIT_TIME``
    seconds, and creation is attempted once more.

    Args:
        client: boto3 ``bedrock-agentcore-control`` client.
        runtime_name: Name for the agent runtime.
        container_uri: Container image URI for the runtime artifact.
        role_arn: IAM role ARN passed to the runtime.
        anthropic_api_key: Optional key; forwarded to the runtime only when
            it is non-empty.
        gateway_access_token: Token forwarded as ``GATEWAY_ACCESS_TOKEN``.
        llm_provider: Value forwarded as ``LLM_PROVIDER``.
        force_recreate: Delete and recreate the runtime on a name conflict.

    Raises:
        ClientError: If the first create call fails for any reason other than
            a name conflict, or if the recreate attempt fails.
    """
    # Build environment variables
    env_vars = {
        "GATEWAY_ACCESS_TOKEN": gateway_access_token,
        "LLM_PROVIDER": llm_provider,
    }
    # Only add ANTHROPIC_API_KEY if it exists
    if anthropic_api_key:
        env_vars["ANTHROPIC_API_KEY"] = anthropic_api_key

    # Propagate DEBUG to the runtime only when it is switched on locally
    if os.getenv("DEBUG", "false").lower() in ("true", "1", "yes"):
        env_vars["DEBUG"] = "true"
        logging.info("Debug mode enabled for agent runtime")

    # Log environment variables being passed to AgentCore (mask sensitive data)
    logging.info("🚀 Environment variables being passed to AgentCore Runtime:")
    for key, value in env_vars.items():
        if key in ("ANTHROPIC_API_KEY", "GATEWAY_ACCESS_TOKEN"):
            logging.info(f" {key}: {_mask_secret(value)}")
        else:
            logging.info(f" {key}: {value}")

    try:
        response = _request_agent_runtime(
            client, runtime_name, container_uri, role_arn, env_vars
        )
    except ClientError as e:
        error_code = e.response.get("Error", {}).get("Code", "")
        # Handle non-conflict errors immediately
        if error_code != "ConflictException":
            logging.error(f"Failed to create agent runtime: {e}")
            raise
    else:
        _record_agent_runtime(response, "created")
        return

    # Conflict: a runtime with this name already exists. The recovery runs
    # outside the except block so that a later failure is reported on its own
    # rather than chained to the original conflict error.
    logging.error(f"Agent runtime '{runtime_name}' already exists.")
    logging.info("Listing existing agent runtimes:")
    _list_existing_agent_runtimes(client)

    # If not forcing recreate, provide guidance and exit
    if not force_recreate:
        logging.info(
            "Please retry with a new agent name using the --runtime-name parameter, or use --force-recreate to delete and recreate."
        )
        return

    # Handle force recreate scenario
    logging.info(
        "Force recreate requested, attempting to delete existing runtime..."
    )
    runtime_id = _get_agent_runtime_id_by_name(client, runtime_name)
    if not runtime_id:
        logging.error(f"Could not find runtime ID for '{runtime_name}'")
        return

    if not _delete_agent_runtime(client, runtime_id):
        logging.error("Failed to delete existing runtime")
        return

    # Wait for deletion to complete
    logging.info(
        f"Waiting {DELETION_WAIT_TIME} seconds for deletion to complete..."
    )
    time.sleep(DELETION_WAIT_TIME)

    # Recreate the runtime after successful deletion
    logging.info("Attempting to recreate agent runtime...")
    try:
        response = _request_agent_runtime(
            client, runtime_name, container_uri, role_arn, env_vars
        )
    except ClientError as e:
        if e.response.get("Error", {}).get("Code", "") == "ConflictException":
            logging.error("\n" + "=" * 70)
            logging.error("⚠️ AGENT NAME CONFLICT - AWS CLEANUP STILL IN PROGRESS")
            logging.error("=" * 70)
            logging.error(
                f"Even after waiting {DELETION_WAIT_TIME} seconds, the agent name"
            )
            logging.error(f"'{runtime_name}' is still not available.")
            logging.error("")
            logging.error(
                "This is an AWS internal cleanup delay. Please try one of:"
            )
            logging.error("1. Wait 1-2 more minutes and run the script again")
            logging.error("2. Use a different agent name (e.g., add a timestamp)")
            logging.error(f" ./deployment/build_and_deploy.sh {runtime_name}_v2")
            logging.error("=" * 70)
            print(
                "\n⚠️ Please wait 1-2 minutes for AWS to complete agent deletion,"
            )
            print(" then try running the deployment script again.")
        # Every recreate failure propagates; the guidance above is extra.
        raise

    _record_agent_runtime(response, "recreated")
def main():
    """Parse CLI arguments, load secrets from ``.env`` and deploy the runtime.

    Raises ``FileNotFoundError`` when no ``.env`` file sits next to this
    script and ``ValueError`` when ``GATEWAY_ACCESS_TOKEN`` is not set.
    """

    def masked(secret):
        # Show only the last 8 characters, and only for secrets longer than 8.
        tail = secret[-8:] if len(secret) > 8 else "***"
        return f"{'*' * 20}...{tail}"

    cli = argparse.ArgumentParser(
        description="Deploy SRE Agent to AgentCore Runtime"
    )
    cli.add_argument(
        "--runtime-name",
        default="sre-agent",
        help="Name for the agent runtime (default: sre-agent)",
    )
    cli.add_argument(
        "--container-uri",
        required=True,
        help="Container URI (e.g., account-id.dkr.ecr.us-west-2.amazonaws.com/my-agent:latest)",
    )
    cli.add_argument(
        "--role-arn", required=True, help="IAM role ARN for the agent runtime"
    )
    # The region default is read from the process environment here, before
    # the .env file is loaded further down.
    cli.add_argument(
        "--region",
        default=os.environ.get("AWS_REGION", "us-east-1"),
        help="AWS region (default: AWS_REGION env var or us-east-1)",
    )
    cli.add_argument(
        "--force-recreate",
        action="store_true",
        help="Delete existing runtime if it exists and recreate it",
    )
    options = cli.parse_args()

    # The .env file is expected in the same directory as this script.
    dotenv_path = Path(__file__).parent / ".env"
    if not dotenv_path.exists():
        logging.error(f".env file not found at {dotenv_path}")
        raise FileNotFoundError(
            f"Please create a .env file at {dotenv_path} with GATEWAY_ACCESS_TOKEN and optionally ANTHROPIC_API_KEY"
        )
    load_dotenv(dotenv_path)
    logging.info(f"Loaded environment variables from {dotenv_path}")

    api_key = os.getenv("ANTHROPIC_API_KEY")
    access_token = os.getenv("GATEWAY_ACCESS_TOKEN")
    provider = os.getenv("LLM_PROVIDER", "bedrock")

    # Report what was loaded, never printing a secret in full.
    logging.info("📋 Environment variables loaded:")
    logging.info(f" LLM_PROVIDER: {provider}")
    if not api_key:
        logging.info(
            " ANTHROPIC_API_KEY: Not set - Amazon Bedrock will be used as the provider"
        )
    else:
        logging.info(f" ANTHROPIC_API_KEY: {masked(api_key)}")

    # The gateway token is mandatory; stop before touching AWS without it.
    if not access_token:
        logging.error("GATEWAY_ACCESS_TOKEN not found in .env")
        raise ValueError("GATEWAY_ACCESS_TOKEN must be set in .env")
    logging.info(f" GATEWAY_ACCESS_TOKEN: {masked(access_token)}")

    control_client = boto3.client(
        "bedrock-agentcore-control", region_name=options.region
    )
    _create_agent_runtime(
        client=control_client,
        runtime_name=options.runtime_name,
        container_uri=options.container_uri,
        role_arn=options.role_arn,
        anthropic_api_key=api_key,
        gateway_access_token=access_token,
        llm_provider=provider,
        force_recreate=options.force_recreate,
    )
# Run the deployment only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()