Dheeraj Oruganty e346e83bf1
fix(02-use-cases): SRE-Agent Deployment (#179)
* Add missing credential_provider_name parameter to config.yaml.example

* Fix get_config function to properly parse YAML values with inline comments

* Enhanced get_config to prevent copy-paste whitespace errors in AWS identifiers

* Improve LLM provider configuration and error handling with bedrock as default

* Add OpenAPI templating system and fix hardcoded regions

* Add backend template build to README

* Delete old YAML files

* Fix Cognito setup with automation script and missing domain creation steps

* docs: Add EC2 instance port configuration documentation

- Document required inbound ports (443, 8011-8014)
- Include SSL/TLS security requirements
- Add AWS security group best practices
- Provide port usage summary table

* docs: Add hyperlinks to prerequisites in README

- Link EC2 port configuration documentation
- Link IAM role authentication setup
- Improve navigation to detailed setup instructions

* docs: Add BACKEND_API_KEY to configuration documentation

- Document gateway environment variables section
- Add BACKEND_API_KEY requirement for credential provider
- Include example .env file format for gateway directory
- Explain usage in create_gateway.sh script

* docs: Add BACKEND_API_KEY to deployment guide environment variables

- Include BACKEND_API_KEY in environment variables reference table
- Mark as required for gateway setup
- Provide quick reference alongside other required variables

* docs: Add BedrockAgentCoreFullAccess policy and trust policy documentation

- Document AWS managed policy BedrockAgentCoreFullAccess
- Add trust policy requirements for bedrock-agentcore.amazonaws.com
- Reorganize IAM permissions for better clarity
- Remove duplicate trust policy section
- Add IAM role requirement to deployment prerequisites

* docs: Document role_name field in gateway config example

- Explain that role_name is used to create and manage the gateway
- Specify BedrockAgentCoreFullAccess policy requirement
- Note trust policy requirement for bedrock-agentcore.amazonaws.com
- Improve clarity for gateway configuration setup

* docs: Add AWS IP address ranges for production security enhancement

- Document AWS IP ranges JSON download for restricting access
- Reference official AWS documentation for IP address ranges
- Provide security alternatives to 0.0.0.0/0 for production
- Include examples of restricted security group configurations
- Enable egress filtering and region-specific access control

* style: Format Python code with black

- Reformat 14 Python files for consistent code style
- Apply PEP 8 formatting standards
- Improve code readability and maintainability

* docs: Update SRE agent prerequisites and setup documentation

- Convert prerequisites section to markdown table format
- Add SSL certificate provider examples (no-ip.com, letsencrypt.org)
- Add Identity Provider (IDP) requirement with setup_cognito.sh reference
- Clarify that all prerequisites must be completed before setup
- Add reference to domain name and cert paths needed for BACKEND_DOMAIN
- Remove Managing OpenAPI Specifications section (covered in use-case setup)
- Add Deployment Guide link to Development to Production section

Addresses issues #171 and #174

* fix: Replace 'AWS Bedrock' with 'Amazon Bedrock' in SRE agent files

- Updated error messages in llm_utils.py
- Updated comments in both .env.example files
- Ensures consistent naming convention across SRE agent codebase

---------

Co-authored-by: dheerajoruganty <dheo@amazon.com>
Co-authored-by: Amit Arora <aroraai@amazon.com>
2025-08-01 13:24:58 -04:00

276 lines
9.2 KiB
Python

#!/usr/bin/env python3
import argparse
import json
import logging
import os
import time
from pathlib import Path
from typing import Optional

import boto3
from botocore.exceptions import ClientError
from dotenv import load_dotenv
# Root-logger setup for the whole script: emit INFO and above, one
# comma-separated record per line (timestamp, process id, source file and
# line, level, message).
logging.basicConfig(
    format="%(asctime)s,p%(process)s,{%(filename)s:%(lineno)d},%(levelname)s,%(message)s",
    level=logging.INFO,
)
def _write_agent_arn_to_file(agent_arn: str, output_dir: Optional[str] = None) -> None:
    """Persist the agent runtime ARN to a ``.agent_arn`` file.

    The write is best-effort: any failure is logged and swallowed so that a
    local file problem never turns an already-successful deployment into an
    error.

    Args:
        agent_arn: ARN string to store. Written verbatim, without a trailing
            newline.
        output_dir: Directory that receives the file. When ``None`` the
            directory containing this script is used. The directory (and any
            missing parents) is created if it does not exist yet.
    """
    target_dir = Path(__file__).parent if output_dir is None else Path(output_dir)
    arn_file = target_dir / ".agent_arn"
    try:
        # A caller-supplied directory may not exist yet; create it rather than
        # failing the write.
        target_dir.mkdir(parents=True, exist_ok=True)
        # Explicit encoding keeps the file content independent of the locale.
        arn_file.write_text(agent_arn, encoding="utf-8")
        logging.info(f"💾 Agent Runtime ARN saved to {arn_file}")
    except Exception as e:
        # Deliberately broad: this helper must never raise (see docstring).
        logging.error(f"Failed to write agent ARN to file: {e}")
def _get_agent_runtime_id_by_name(
    client: boto3.client, runtime_name: str
) -> Optional[str]:
    """Look up an agent runtime ID by its name.

    ``list_agent_runtimes`` returns results in pages, so every page is walked
    (following ``nextToken``) until a match is found or the listing is
    exhausted. Checking only the first page would miss runtimes in accounts
    that have more runtimes than fit in one response.

    Args:
        client: Client object exposing ``list_agent_runtimes`` (``main`` passes
            a boto3 ``bedrock-agentcore-control`` client).
        runtime_name: Exact ``agentRuntimeName`` to search for.

    Returns:
        The ``agentRuntimeId`` of the first runtime whose name matches, or
        ``None`` when no runtime matches or the listing call fails (the
        failure is logged).
    """
    request_kwargs = {}
    try:
        while True:
            response = client.list_agent_runtimes(**request_kwargs)
            for runtime in response.get("agentRuntimes", []):
                if runtime["agentRuntimeName"] == runtime_name:
                    return runtime["agentRuntimeId"]
            next_token = response.get("nextToken")
            if not next_token:
                # Last page reached without a match.
                return None
            request_kwargs["nextToken"] = next_token
    except ClientError as e:
        logging.error(f"Failed to get agent runtime ID: {e}")
        return None
def _delete_agent_runtime(client: boto3.client, runtime_id: str) -> bool:
    """Request deletion of the agent runtime identified by ``runtime_id``.

    Args:
        client: Client object exposing ``delete_agent_runtime``.
        runtime_id: Value passed as ``agentRuntimeId`` to the delete call.

    Returns:
        ``True`` when the delete call returned without a ``ClientError``,
        ``False`` otherwise (the error is logged, not raised).
    """
    logging.info(f"Deleting agent runtime with ID: {runtime_id}")
    try:
        client.delete_agent_runtime(agentRuntimeId=runtime_id)
    except ClientError as err:
        logging.error(f"Failed to delete agent runtime: {err}")
        return False
    logging.info("Agent runtime deleted successfully")
    return True
def _list_existing_agent_runtimes(client: boto3.client) -> None:
    """Log every existing agent runtime as pretty-printed JSON.

    All pages of ``list_agent_runtimes`` are fetched by following
    ``nextToken``; reading only the first response would silently omit
    runtimes from the listing. Entries are logged page by page, so runtimes
    already retrieved are still shown if a later page request fails.

    Args:
        client: Client object exposing ``list_agent_runtimes``.

    A ``ClientError`` from the listing call is logged and not raised.
    """
    request_kwargs = {}
    header_logged = False
    try:
        while True:
            response = client.list_agent_runtimes(**request_kwargs)
            page = response.get("agentRuntimes", [])
            if page and not header_logged:
                logging.info("Existing agent runtimes:")
                header_logged = True
            for runtime in page:
                # default=str renders values json cannot serialise natively.
                logging.info(json.dumps(runtime, indent=2, default=str))
            next_token = response.get("nextToken")
            if not next_token:
                break
            request_kwargs["nextToken"] = next_token
        if not header_logged:
            logging.info("No existing agent runtimes found.")
    except ClientError as e:
        logging.error(f"Failed to list agent runtimes: {e}")
def _create_agent_runtime(
    client: boto3.client,
    runtime_name: str,
    container_uri: str,
    role_arn: str,
    anthropic_api_key: str,
    gateway_access_token: str,
    llm_provider: str = "bedrock",
    force_recreate: bool = False,
) -> None:
    """Create the agent runtime, optionally replacing one with the same name.

    The runtime is created with ``networkMode`` ``PUBLIC`` and receives
    ``GATEWAY_ACCESS_TOKEN`` and ``LLM_PROVIDER`` as environment variables,
    plus ``ANTHROPIC_API_KEY`` when one is supplied and ``DEBUG=true`` when the
    local ``DEBUG`` environment variable is ``true``/``1``/``yes``. On success
    the runtime ARN is handed to ``_write_agent_arn_to_file``.

    Args:
        client: Client object exposing ``create_agent_runtime``.
        runtime_name: Name for the new runtime.
        container_uri: Container image URI for the runtime artifact.
        role_arn: IAM role ARN passed as ``roleArn``.
        anthropic_api_key: Optional API key; omitted from the environment when
            falsy.
        gateway_access_token: Token forwarded as ``GATEWAY_ACCESS_TOKEN``.
        llm_provider: Value forwarded as ``LLM_PROVIDER``.
        force_recreate: When the name is already taken, delete the existing
            runtime and create it again instead of giving up.

    Raises:
        ClientError: For any creation failure other than ``ConflictException``,
            and for a failure of the second creation attempt after a forced
            delete.
    """
    # Insertion order is kept as-is because it determines the log order below.
    runtime_env = {
        "GATEWAY_ACCESS_TOKEN": gateway_access_token,
        "LLM_PROVIDER": llm_provider,
    }
    if anthropic_api_key:
        runtime_env["ANTHROPIC_API_KEY"] = anthropic_api_key
    if os.getenv("DEBUG", "false").lower() in ("true", "1", "yes"):
        runtime_env["DEBUG"] = "true"
        logging.info("Debug mode enabled for agent runtime")

    # Secrets are logged masked: only the last 8 characters are shown, and
    # nothing at all for values of 8 characters or fewer.
    secret_names = ("ANTHROPIC_API_KEY", "GATEWAY_ACCESS_TOKEN")
    logging.info("🚀 Environment variables being passed to AgentCore Runtime:")
    for name, setting in runtime_env.items():
        shown = setting
        if name in secret_names:
            tail = setting[-8:] if len(setting) > 8 else "***"
            shown = f"{'*' * 20}...{tail}"
        logging.info(f" {name}: {shown}")

    def _submit(success_message: str) -> None:
        """Issue the create call, log the outcome and record the ARN."""
        result = client.create_agent_runtime(
            agentRuntimeName=runtime_name,
            agentRuntimeArtifact={
                "containerConfiguration": {"containerUri": container_uri}
            },
            networkConfiguration={"networkMode": "PUBLIC"},
            roleArn=role_arn,
            environmentVariables=runtime_env,
        )
        logging.info(success_message)
        logging.info(f"Agent Runtime ARN: {result['agentRuntimeArn']}")
        logging.info(f"Status: {result['status']}")
        _write_agent_arn_to_file(result["agentRuntimeArn"])

    try:
        _submit("Agent Runtime created successfully!")
    except ClientError as e:
        failure_code = e.response.get("Error", {}).get("Code", "")
        if failure_code != "ConflictException":
            # Anything but a name clash is not recoverable here.
            logging.error(f"Failed to create agent runtime: {e}")
            raise

        # Name clash: show what already exists so the operator can decide.
        logging.error(f"Agent runtime '{runtime_name}' already exists.")
        logging.info("Listing existing agent runtimes:")
        _list_existing_agent_runtimes(client)

        if not force_recreate:
            logging.info(
                "Please retry with a new agent name using the --runtime-name parameter, or use --force-recreate to delete and recreate."
            )
            return

        logging.info(
            "Force recreate requested, attempting to delete existing runtime..."
        )
        existing_id = _get_agent_runtime_id_by_name(client, runtime_name)
        if not existing_id:
            logging.error(f"Could not find runtime ID for '{runtime_name}'")
            return
        if not _delete_agent_runtime(client, existing_id):
            logging.error("Failed to delete existing runtime")
            return

        # NOTE(review): fixed pause only; nothing verifies the deletion has
        # actually finished before the second create call is issued.
        logging.info("Waiting 10 seconds for deletion to complete...")
        time.sleep(10)

        logging.info("Attempting to recreate agent runtime...")
        _submit("Agent Runtime recreated successfully!")
def main():
    """Command-line entry point: deploy the SRE agent to AgentCore Runtime.

    Parses the CLI options, loads the ``.env`` file that sits next to this
    script, logs the resolved settings with secrets masked, and then calls
    ``_create_agent_runtime`` with a ``bedrock-agentcore-control`` client for
    the selected region.

    Raises:
        FileNotFoundError: If no ``.env`` file exists next to this script.
        ValueError: If ``GATEWAY_ACCESS_TOKEN`` is not set after loading it.
    """
    cli = argparse.ArgumentParser(
        description="Deploy SRE Agent to AgentCore Runtime"
    )
    cli.add_argument(
        "--runtime-name",
        default="sre-agent",
        help="Name for the agent runtime (default: sre-agent)",
    )
    cli.add_argument(
        "--container-uri",
        required=True,
        help="Container URI (e.g., account-id.dkr.ecr.us-west-2.amazonaws.com/my-agent:latest)",
    )
    cli.add_argument(
        "--role-arn", required=True, help="IAM role ARN for the agent runtime"
    )
    cli.add_argument(
        "--region", default="us-east-1", help="AWS region (default: us-east-1)"
    )
    cli.add_argument(
        "--force-recreate",
        action="store_true",
        help="Delete existing runtime if it exists and recreate it",
    )
    options = cli.parse_args()

    # The .env file is required and must live alongside this script.
    dotenv_path = Path(__file__).parent / ".env"
    if not dotenv_path.exists():
        logging.error(f".env file not found at {dotenv_path}")
        raise FileNotFoundError(
            f"Please create a .env file at {dotenv_path} with GATEWAY_ACCESS_TOKEN and optionally ANTHROPIC_API_KEY"
        )
    load_dotenv(dotenv_path)
    logging.info(f"Loaded environment variables from {dotenv_path}")

    api_key = os.getenv("ANTHROPIC_API_KEY")
    access_token = os.getenv("GATEWAY_ACCESS_TOKEN")
    provider = os.getenv("LLM_PROVIDER", "bedrock")

    def _masked(secret: str) -> str:
        """Hide a secret, revealing its last 8 characters only if longer than 8."""
        suffix = secret[-8:] if len(secret) > 8 else "***"
        return f"{'*' * 20}...{suffix}"

    logging.info("📋 Environment variables loaded:")
    logging.info(f" LLM_PROVIDER: {provider}")
    if api_key:
        logging.info(f" ANTHROPIC_API_KEY: {_masked(api_key)}")
    else:
        logging.info(
            " ANTHROPIC_API_KEY: Not set - Amazon Bedrock will be used as the provider"
        )

    # The gateway token is mandatory; stop before touching AWS without it.
    if not access_token:
        logging.error("GATEWAY_ACCESS_TOKEN not found in .env")
        raise ValueError("GATEWAY_ACCESS_TOKEN must be set in .env")
    logging.info(f" GATEWAY_ACCESS_TOKEN: {_masked(access_token)}")

    control_client = boto3.client(
        "bedrock-agentcore-control", region_name=options.region
    )
    _create_agent_runtime(
        client=control_client,
        runtime_name=options.runtime_name,
        container_uri=options.container_uri,
        role_arn=options.role_arn,
        anthropic_api_key=api_key,
        gateway_access_token=access_token,
        llm_provider=provider,
        force_recreate=options.force_recreate,
    )


if __name__ == "__main__":
    main()