amazon-bedrock-agentcore-sa.../02-use-cases/SRE-agent/deployment/build_and_deploy.sh

#!/bin/bash

# Exit on error
set -e

# Print the usage text and exit successfully when the first argument asks for help
case "${1:-}" in
    --help|-h)
        cat <<EOF
Usage: $0 [ECR_REPO_NAME]

Arguments:
  ECR_REPO_NAME    Name for the ECR repository (default: sre_agent)

Environment Variables:
  LOCAL_BUILD      Set to 'true' for local container build without ECR push
  PLATFORM         Set to 'x86_64' to build for local testing (default: arm64 for AgentCore)
  DEBUG            Set to 'true' to enable debug mode in deployed agent
  LLM_PROVIDER     Set to 'anthropic' or 'bedrock' (default: bedrock)
  ANTHROPIC_API_KEY Required when using anthropic provider

Examples:
  # Deploy with default repo name
  ./build_and_deploy.sh

  # Deploy with custom repo name
  ./build_and_deploy.sh my_custom_sre_agent

  # Local build for testing
  LOCAL_BUILD=true ./build_and_deploy.sh

  # Deploy with debug and anthropic provider
  DEBUG=true LLM_PROVIDER=anthropic ./build_and_deploy.sh my_sre_agent
EOF
        exit 0
        ;;
esac

# Absolute path of the directory holding this script, and of its parent
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PARENT_DIR="$(dirname "$SCRIPT_DIR")"

# Deployment settings: values already present in the environment win over the
# defaults; the repository name comes from the first positional argument.
: "${AWS_REGION:=us-east-1}"
ECR_REPO_NAME="${1:-sre_agent}"
: "${RUNTIME_NAME:=$ECR_REPO_NAME}"
AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
ECR_REPO_URI="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPO_NAME}"

# Build target (ARM64 by default, for AgentCore) and local-build switch
: "${PLATFORM:=arm64}"
: "${LOCAL_BUILD:=false}"

# Work out which IAM role ARN to pass to the agent runtime deployment.
#
# The caller's ARN is requested as plain text through the AWS CLI's own
# --query/--output options, so no external JSON parser is needed and no
# unquoted expansion is piped through echo.
CURRENT_ARN="$(aws sts get-caller-identity --query Arn --output text)"

if [[ "$CURRENT_ARN" == *":assumed-role/"* ]]; then
    # The role name is the path segment right after ":assumed-role/";
    # whatever follows the next "/" (the session part) is dropped.
    ROLE_NAME="${CURRENT_ARN##*:assumed-role/}"
    ROLE_NAME="${ROLE_NAME%%/*}"
    # NOTE(review): the role is rebuilt as role/<name> with no IAM path;
    # roles created under a path presumably need AGENT_ROLE_ARN - confirm.
    ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/${ROLE_NAME}"
else
    # Default role if not running with an assumed role
    ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/BedrockAgentCoreRole"
    echo "⚠️  Not running with an assumed role. Will use default role: $ROLE_ARN"
fi

# A non-empty AGENT_ROLE_ARN in the environment overrides the derived value
ROLE_ARN="${AGENT_ROLE_ARN:-$ROLE_ARN}"

# Authenticate Docker against ECR and make sure the target repository exists.
# Both steps are skipped when LOCAL_BUILD=true: a local build never pushes,
# so it should not log in to the registry or create a repository.
if [ "$LOCAL_BUILD" != "true" ]; then
    echo "🔐 Logging in to Amazon ECR..."
    aws ecr get-login-password --region "$AWS_REGION" \
        | docker login --username AWS --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com"

    # Create repository if it doesn't exist
    echo "📦 Creating ECR repository if it doesn't exist..."
    # describe-repositories is only an existence probe, so its listing is
    # discarded; its error output stays visible to explain a failed probe.
    if ! aws ecr describe-repositories --repository-names "$ECR_REPO_NAME" --region "$AWS_REGION" >/dev/null; then
        aws ecr create-repository --repository-name "$ECR_REPO_NAME" --region "$AWS_REGION"
    fi
fi

# Build the container image, choosing the Dockerfile and target platform.
#   - PLATFORM=x86_64 or LOCAL_BUILD=true -> linux/amd64 using Dockerfile.x86_64
#   - otherwise                           -> linux/arm64 using the root Dockerfile
if [ "$PLATFORM" = "x86_64" ] || [ "$LOCAL_BUILD" = "true" ]; then
    echo "🏗️ Building Docker image for linux/amd64 (x86_64)..."
    DOCKERFILE="$PARENT_DIR/Dockerfile.x86_64"
    # Force platform to linux/amd64 for x86_64 builds
    docker build --platform linux/amd64 -f "$DOCKERFILE" -t "$ECR_REPO_NAME" "$PARENT_DIR"
else
    # Dockerfile for the ARM64 image lives at the project root
    DOCKERFILE="$PARENT_DIR/Dockerfile"

    case "$(uname -m)" in
        aarch64|arm64)
            # Native ARM64 host: no emulation is required, and the QEMU
            # registration helper image targets x86_64 hosts, so running it
            # here would fail and abort the script under `set -e`.
            echo "🏗️ Building Docker image for linux/arm64 (native ARM64 host, no emulation needed)..."
            ;;
        *)
            # Register QEMU binfmt handlers so the ARM64 image can be built
            echo "🔧 Setting up QEMU for ARM64 emulation..."
            docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
            echo "🏗️ Building Docker image for linux/arm64 (this may take longer due to emulation)..."
            ;;
    esac

    # Explicitly set platform for ARM64
    DOCKER_BUILDKIT=0 docker build --platform linux/arm64 -f "$DOCKERFILE" -t "$ECR_REPO_NAME" "$PARENT_DIR"
fi

# Local builds end here: print how to run the image and skip push/deployment
if [[ "$LOCAL_BUILD" == "true" ]]; then
    cat <<EOF
✅ Successfully built local image: ${ECR_REPO_NAME}:latest

📝 To run the container locally:
docker run -p 8080:8080 --env-file ${PARENT_DIR}/sre_agent/.env ${ECR_REPO_NAME}:latest

Or with AWS credentials (bedrock provider - default):
docker run -p 8080:8080 -v ~/.aws:/root/.aws:ro -e AWS_PROFILE=default -e GATEWAY_ACCESS_TOKEN=\$GATEWAY_ACCESS_TOKEN ${ECR_REPO_NAME}:latest

Or with Anthropic provider:
docker run -p 8080:8080 -e LLM_PROVIDER=anthropic -e ANTHROPIC_API_KEY=\$ANTHROPIC_API_KEY -e GATEWAY_ACCESS_TOKEN=\$GATEWAY_ACCESS_TOKEN ${ECR_REPO_NAME}:latest
EOF
    exit 0
fi

# Fully qualified image reference in ECR, and the file that records it
remote_image="${ECR_REPO_URI}:latest"
uri_file="${SCRIPT_DIR}/.sre_agent_uri"

# Point the ECR reference at the freshly built local image
echo "🏷️ Tagging image..."
docker tag "${ECR_REPO_NAME}:latest" "$remote_image"

# Upload it to the registry
echo "⬆️ Pushing image to ECR..."
docker push "$remote_image"

echo "✅ Successfully built and pushed image to:"
echo "$remote_image"

# Record the container URI next to this script
echo "💾 Saving container URI to .sre_agent_uri file..."
echo "$remote_image" > "$uri_file"
echo "Container URI saved to $uri_file"

# Announce the deployment and the parameters it will use
cat <<EOF

🚀 Deploying agent runtime...
Using role ARN: $ROLE_ARN
Using runtime name: $RUNTIME_NAME
Using region: $AWS_REGION
EOF

# Refuse to continue unless a .env file sits next to this script
env_file="$SCRIPT_DIR/.env"
[[ -f "$env_file" ]] || {
    echo "❌ Error: .env file not found at $env_file"
    echo "Please create a .env file with ANTHROPIC_API_KEY and GATEWAY_ACCESS_TOKEN"
    echo "You can use .env.example as a template"
    exit 1
}

echo ".env file found at $env_file"

# Deploy the agent runtime by running deployment/deploy_agent_runtime.py
# through `uv`, capturing its combined stdout/stderr in a temporary file that
# is printed once the command finishes (on success and on failure alike).

# The EXIT trap removes the capture file on every exit path, including the
# failure branch below and an interrupted run, so no temp file is left behind.
TEMP_OUTPUT=$(mktemp)
trap 'rm -f -- "$TEMP_OUTPUT"' EXIT

# Log environment variables being passed
echo "🔧 Environment variables for deployment:"
echo "   DEBUG: ${DEBUG:-not set}"
echo "   LLM_PROVIDER: ${LLM_PROVIDER:-bedrock (default)}"
if [ -n "$ANTHROPIC_API_KEY" ]; then
    # Only the trailing 8 characters of the key are printed.
    # NOTE(review): that is still part of a secret ending up in logs - confirm
    # this is acceptable wherever the script output is stored.
    echo "   ANTHROPIC_API_KEY: ***...${ANTHROPIC_API_KEY: -8}"
else
    echo "   ANTHROPIC_API_KEY: not set"
fi

# Run uv from the parent directory; the script path below is relative to it
cd "$PARENT_DIR"

# Run the Python script and capture both return code and output
# Pass through DEBUG, LLM_PROVIDER, and ANTHROPIC_API_KEY environment variables
# (LLM_PROVIDER falls back to "bedrock" when unset or empty).
if DEBUG="$DEBUG" LLM_PROVIDER="${LLM_PROVIDER:-bedrock}" ANTHROPIC_API_KEY="$ANTHROPIC_API_KEY" uv run python deployment/deploy_agent_runtime.py \
    --container-uri "$ECR_REPO_URI:latest" \
    --role-arn "$ROLE_ARN" \
    --runtime-name "$RUNTIME_NAME" \
    --region "$AWS_REGION" \
    --force-recreate > "$TEMP_OUTPUT" 2>&1; then

    # Success - show output
    DEPLOY_OUTPUT=$(cat "$TEMP_OUTPUT")
    echo "$DEPLOY_OUTPUT"
else
    # Failure - show error output and exit (the EXIT trap cleans up)
    echo "❌ Agent runtime deployment failed!"
    echo "Error output:"
    cat "$TEMP_OUTPUT"
    exit 1
fi

echo ""
echo "🎉 Build and deployment complete!"
fix(SRE Agent)- Deploy SRE Agent on Amazon Bedrock AgentCore Runtime with Enhanced Architecture (#158) * feat: Deploy SRE agent on Amazon Bedrock AgentCore Runtime - Add agent_runtime.py with FastAPI endpoints for AgentCore compatibility - Create Dockerfile for ARM64-based containerization - Add deployment scripts for automated ECR push and AgentCore deployment - Update backend API URLs from placeholders to actual endpoints - Update gateway configuration for production use - Add dependencies for AgentCore runtime support Implements #143 * chore: Add deployment artifacts to .gitignore - Add deployment/.sre_agent_uri, deployment/.env, and deployment/.agent_arn to .gitignore - Remove already tracked deployment artifacts from git * feat: Make ANTHROPIC_API_KEY optional in deployment - Update deploy_agent_runtime.py to conditionally include ANTHROPIC_API_KEY - Show info message when using Amazon Bedrock as provider - Update .env.example to clarify ANTHROPIC_API_KEY is optional - Only include ANTHROPIC_API_KEY in environment variables if it exists * fix: Use uv run python instead of python in build script - Update build_and_deploy.sh to use 'uv run python' for deployment - Change to parent directory to ensure uv environment is available - Fixes 'python: command not found' error during deployment * refactor: Improve deployment script structure and create .env symlink - Flatten nested if-else blocks in deploy_agent_runtime.py for better readability - Add 10-second sleep after deletion to ensure cleanup completes - Create symlink from deployment/.env to sre_agent/.env to avoid duplication - Move time import to top of file with other imports * feat: Add debug mode support and comprehensive deployment guide Add --debug command line flag and DEBUG environment variable support: - Created shared logging configuration module - Updated CLI and runtime to support --debug flag - Made debug traces conditional on DEBUG environment variable - Added debug mode for container and 
AgentCore deployments Enhanced build and deployment script: - Added command line argument for ECR repository name - Added help documentation and usage examples - Added support for local builds (x86_64) vs AgentCore builds (arm64) - Added environment variable pass-through for DEBUG, LLM_PROVIDER, ANTHROPIC_API_KEY Created comprehensive deployment guide: - Step-by-step instructions from local testing to production - Docker platform documentation (x86_64 vs arm64) - Environment variable configuration with .env file usage - Debug mode examples and troubleshooting guide - Provider configuration for Bedrock and Anthropic Updated README with AgentCore Runtime deployment section and documentation links. * docs: Update SRE Agent README with deployment flow diagram and fix directory reference - Fix reference from 04-SRE-agent to SRE-agent in README - Add comprehensive flowchart showing development to production deployment flow - Update overview to mention Amazon Bedrock AgentCore Runtime deployment - Remove emojis from documentation for professional appearance * docs: Replace mermaid diagram with ASCII step-by-step flow diagram - Change from block-style mermaid diagram to ASCII flow diagram - Show clear step-by-step progression from development to production - Improve readability with structured boxes and arrows - Minor text improvements for clarity * feat: Implement comprehensive prompt management system and enhance deployment guide - Create centralized prompt template system with external files in config/prompts/ - Add PromptLoader utility class with LRU caching and template variable substitution - Integrate PromptConfig into SREConstants for centralized configuration management - Update all agents (nodes, supervisor, output_formatter) to use prompt loader - Replace 150+ lines of hardcoded prompts with modular, maintainable template system - Enhance deployment guide with consistent naming (my_custom_sre_agent) throughout - Add quick-start copy-paste command sequence for 
streamlined deployment - Improve constants system with comprehensive model, AWS, timeout, and prompt configs - Add architectural assessment document to .gitignore for local analysis - Run black formatting across all updated Python files * docs: Consolidate deployment and security documentation - Rename deployment-and-security.md to security.md and remove redundant deployment content - Enhance security.md with comprehensive production security guidelines including: - Authentication and authorization best practices - Encryption and data protection requirements - Operational security monitoring and logging - Input validation and prompt security measures - Infrastructure security recommendations - Compliance and governance frameworks - Update README.md to reference new security.md file - Eliminate redundancy between deployment-guide.md and deployment-and-security.md - Improve documentation organization with clear separation of concerns * config: Replace hardcoded endpoints with placeholder domains - Update OpenAPI specifications to use placeholder domain 'your-backend-domain.com' - k8s_api.yaml: mcpgateway.ddns.net:8011 -> your-backend-domain.com:8011 - logs_api.yaml: mcpgateway.ddns.net:8012 -> your-backend-domain.com:8012 - metrics_api.yaml: mcpgateway.ddns.net:8013 -> your-backend-domain.com:8013 - runbooks_api.yaml: mcpgateway.ddns.net:8014 -> your-backend-domain.com:8014 - Update agent configuration to use placeholder AgentCore gateway endpoint - agent_config.yaml: Replace specific gateway ID with 'your-agentcore-gateway-endpoint' - Improve security by removing hardcoded production endpoints from repository - Enable template-based configuration that users can customize during setup - Align with existing documentation patterns for placeholder domain replacement 2025-07-27 15:05:03 -04:00			`#!/bin/bash`

			`# Exit on error`
			`set -e`

			`# Show usage if --help is passed`
			`if [[ "$1" == "--help" ]] \|\| [[ "$1" == "-h" ]]; then`
			`echo "Usage: $0 [ECR_REPO_NAME]"`
			`echo ""`
			`echo "Arguments:"`
			`echo " ECR_REPO_NAME Name for the ECR repository (default: sre_agent)"`
			`echo ""`
			`echo "Environment Variables:"`
			`echo " LOCAL_BUILD Set to 'true' for local container build without ECR push"`
			`echo " PLATFORM Set to 'x86_64' to build for local testing (default: arm64 for AgentCore)"`
			`echo " DEBUG Set to 'true' to enable debug mode in deployed agent"`
			`echo " LLM_PROVIDER Set to 'anthropic' or 'bedrock' (default: bedrock)"`
			`echo " ANTHROPIC_API_KEY Required when using anthropic provider"`
			`echo ""`
			`echo "Examples:"`
			`echo " # Deploy with default repo name"`
			`echo " ./build_and_deploy.sh"`
			`echo ""`
			`echo " # Deploy with custom repo name"`
			`echo " ./build_and_deploy.sh my_custom_sre_agent"`
			`echo ""`
			`echo " # Local build for testing"`
			`echo " LOCAL_BUILD=true ./build_and_deploy.sh"`
			`echo ""`
			`echo " # Deploy with debug and anthropic provider"`
			`echo " DEBUG=true LLM_PROVIDER=anthropic ./build_and_deploy.sh my_sre_agent"`
			`exit 0`
			`fi`

			`# Get the directory where this script is located`
			`SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"`
			`PARENT_DIR="$(dirname "$SCRIPT_DIR")"`

			`# Configuration`
			`AWS_REGION="${AWS_REGION:-us-east-1}"`
			`ECR_REPO_NAME="${1:-sre_agent}"`
			`RUNTIME_NAME="${RUNTIME_NAME:-$ECR_REPO_NAME}"`
			`AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)`
			`ECR_REPO_URI="$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$ECR_REPO_NAME"`

			`# Platform configuration (default to ARM64 for AgentCore)`
			`PLATFORM="${PLATFORM:-arm64}"`
			`LOCAL_BUILD="${LOCAL_BUILD:-false}"`

			`# Get current caller identity and construct role ARN`
			`CALLER_IDENTITY=$(aws sts get-caller-identity --output json)`
			`CURRENT_ARN=$(echo $CALLER_IDENTITY \| jq -r '.Arn')`

			`# Extract role name from ARN and construct role ARN`
			`# This handles both assumed-role and user scenarios`
			`if [[ $CURRENT_ARN == *":assumed-role/"* ]]; then`
			`# Extract role name from assumed role ARN`
			`ROLE_NAME=$(echo $CURRENT_ARN \| sed 's/.*:assumed-role\/\([^\/]*\).*/\1/')`
			`ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/${ROLE_NAME}"`
			`else`
			`# Default role if not running with an assumed role`
			`ROLE_ARN="arn:aws:iam::${AWS_ACCOUNT_ID}:role/BedrockAgentCoreRole"`
			`echo "⚠️ Not running with an assumed role. Will use default role: $ROLE_ARN"`
			`fi`

			`# Allow override via environment variable`
			`ROLE_ARN="${AGENT_ROLE_ARN:-$ROLE_ARN}"`

			`echo "🔐 Logging in to Amazon ECR..."`
			`aws ecr get-login-password --region $AWS_REGION \| docker login --username AWS --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com"`

			`# Create repository if it doesn't exist`
			`echo "📦 Creating ECR repository if it doesn't exist..."`
			`aws ecr describe-repositories --repository-names "$ECR_REPO_NAME" --region "$AWS_REGION" \|\| \`
			`aws ecr create-repository --repository-name "$ECR_REPO_NAME" --region "$AWS_REGION"`

			`# Determine which Dockerfile to use and set up build environment`
			`if [ "$PLATFORM" = "x86_64" ] \|\| [ "$LOCAL_BUILD" = "true" ]; then`
			`echo "🏗️ Building Docker image for linux/amd64 (x86_64)..."`
			`DOCKERFILE="$PARENT_DIR/Dockerfile.x86_64"`
			`# Force platform to linux/amd64 for x86_64 builds`
			`docker build --platform linux/amd64 -f "$DOCKERFILE" -t "$ECR_REPO_NAME" "$PARENT_DIR"`
			`else`
			`# Set up QEMU for ARM64 emulation`
			`echo "🔧 Setting up QEMU for ARM64 emulation..."`
			`docker run --rm --privileged multiarch/qemu-user-static --reset -p yes`

			`# Build the Docker image for ARM64 (Dockerfile is at root level)`
			`echo "🏗️ Building Docker image for linux/arm64 (this may take longer due to emulation)..."`
			`DOCKERFILE="$PARENT_DIR/Dockerfile"`
			`# Explicitly set platform for ARM64`
			`DOCKER_BUILDKIT=0 docker build --platform linux/arm64 -f "$DOCKERFILE" -t "$ECR_REPO_NAME" "$PARENT_DIR"`
			`fi`

			`# For local builds, skip ECR push and deployment`
			`if [ "$LOCAL_BUILD" = "true" ]; then`
			`echo "✅ Successfully built local image: $ECR_REPO_NAME:latest"`
			`echo ""`
			`echo "📝 To run the container locally:"`
			`echo "docker run -p 8080:8080 --env-file $PARENT_DIR/sre_agent/.env $ECR_REPO_NAME:latest"`
			`echo ""`
			`echo "Or with AWS credentials (bedrock provider - default):"`
			`echo "docker run -p 8080:8080 -v ~/.aws:/root/.aws:ro -e AWS_PROFILE=default -e GATEWAY_ACCESS_TOKEN=\$GATEWAY_ACCESS_TOKEN $ECR_REPO_NAME:latest"`
			`echo ""`
			`echo "Or with Anthropic provider:"`
			`echo "docker run -p 8080:8080 -e LLM_PROVIDER=anthropic -e ANTHROPIC_API_KEY=\$ANTHROPIC_API_KEY -e GATEWAY_ACCESS_TOKEN=\$GATEWAY_ACCESS_TOKEN $ECR_REPO_NAME:latest"`
			`exit 0`
			`fi`

			`# Tag the image`
			`echo "🏷️ Tagging image..."`
			`docker tag "$ECR_REPO_NAME":latest "$ECR_REPO_URI":latest`

			`# Push the image to ECR`
			`echo "⬆️ Pushing image to ECR..."`
			`docker push "$ECR_REPO_URI":latest`

			`echo "✅ Successfully built and pushed image to:"`
			`echo "$ECR_REPO_URI:latest"`

			`# Save the container URI to a file in the script directory`
			`echo "💾 Saving container URI to .sre_agent_uri file..."`
			`echo "$ECR_REPO_URI:latest" > "$SCRIPT_DIR/.sre_agent_uri"`
			`echo "Container URI saved to $SCRIPT_DIR/.sre_agent_uri"`

			`# Deploy the agent runtime`
			`echo ""`
			`echo "🚀 Deploying agent runtime..."`
			`echo "Using role ARN: $ROLE_ARN"`
			`echo "Using runtime name: $RUNTIME_NAME"`
			`echo "Using region: $AWS_REGION"`

			`# Check if .env file exists`
			`if [ ! -f "$SCRIPT_DIR/.env" ]; then`
			`echo "❌ Error: .env file not found at $SCRIPT_DIR/.env"`
			`echo "Please create a .env file with ANTHROPIC_API_KEY and GATEWAY_ACCESS_TOKEN"`
			`echo "You can use .env.example as a template"`
			`exit 1`
			`fi`

			`echo ".env file found at $SCRIPT_DIR/.env"`

			`# Deploy using the Python script`
			`cd "$SCRIPT_DIR"`

			`# Create a temporary file to capture output`
			`TEMP_OUTPUT=$(mktemp)`

			`# Log environment variables being passed`
			`echo "🔧 Environment variables for deployment:"`
			`echo " DEBUG: ${DEBUG:-not set}"`
			`echo " LLM_PROVIDER: ${LLM_PROVIDER:-bedrock (default)}"`
			`if [ -n "$ANTHROPIC_API_KEY" ]; then`
			`echo " ANTHROPIC_API_KEY: ***...${ANTHROPIC_API_KEY: -8}"`
			`else`
			`echo " ANTHROPIC_API_KEY: not set"`
			`fi`

			`# Change to parent directory to use uv`
			`cd "$PARENT_DIR"`

			`# Run the Python script and capture both return code and output`
			`# Pass through DEBUG, LLM_PROVIDER, and ANTHROPIC_API_KEY environment variables`
			`if DEBUG="$DEBUG" LLM_PROVIDER="${LLM_PROVIDER:-bedrock}" ANTHROPIC_API_KEY="$ANTHROPIC_API_KEY" uv run python deployment/deploy_agent_runtime.py \`
			`--container-uri "$ECR_REPO_URI:latest" \`
			`--role-arn "$ROLE_ARN" \`
			`--runtime-name "$RUNTIME_NAME" \`
			`--region "$AWS_REGION" \`
			`--force-recreate > "$TEMP_OUTPUT" 2>&1; then`

			`# Success - show output`
			`DEPLOY_OUTPUT=$(cat "$TEMP_OUTPUT")`
			`echo "$DEPLOY_OUTPUT"`
			`else`
			`# Failure - show error output and exit`
			`echo "❌ Agent runtime deployment failed!"`
			`echo "Error output:"`
			`cat "$TEMP_OUTPUT"`
			`rm -f "$TEMP_OUTPUT"`
			`exit 1`
			`fi`

			`# Clean up temporary file`
			`rm -f "$TEMP_OUTPUT"`

			`echo ""`
			`echo "🎉 Build and deployment complete!"`