241 lines
7.9 KiB
Python
Raw Permalink Normal View History

fix(02-use-cases): SRE-Agent Deployment (#179) * Add missing credential_provider_name parameter to config.yaml.example * Fix get_config function to properly parse YAML values with inline comments * Enhanced get_config to prevent copy-paste whitespace errors in AWS identifiers * Improve LLM provider configuration and error handling with bedrock as default * Add OpenAPI templating system and fix hardcoded regions * Add backend template build to Readme * delete old yaml files * Fix Cognito setup with automation script and missing domain creation steps * docs: Add EC2 instance port configuration documentation - Document required inbound ports (443, 8011-8014) - Include SSL/TLS security requirements - Add AWS security group best practices - Provide port usage summary table * docs: Add hyperlinks to prerequisites in README - Link EC2 port configuration documentation - Link IAM role authentication setup - Improve navigation to detailed setup instructions * docs: Add BACKEND_API_KEY to configuration documentation - Document gateway environment variables section - Add BACKEND_API_KEY requirement for credential provider - Include example .env file format for gateway directory - Explain usage in create_gateway.sh script * docs: Add BACKEND_API_KEY to deployment guide environment variables - Include BACKEND_API_KEY in environment variables reference table - Mark as required for gateway setup - Provide quick reference alongside other required variables * docs: Add BedrockAgentCoreFullAccess policy and trust policy documentation - Document AWS managed policy BedrockAgentCoreFullAccess - Add trust policy requirements for bedrock-agentcore.amazonaws.com - Reorganize IAM permissions for better clarity - Remove duplicate trust policy section - Add IAM role requirement to deployment prerequisites * docs: Document role_name field in gateway config example - Explain that role_name is used to create and manage the gateway - Specify BedrockAgentCoreFullAccess policy requirement - Note 
trust policy requirement for bedrock-agentcore.amazonaws.com - Improve clarity for gateway configuration setup * docs: Add AWS IP address ranges for production security enhancement - Document AWS IP ranges JSON download for restricting access - Reference official AWS documentation for IP address ranges - Provide security alternatives to 0.0.0.0/0 for production - Include examples of restricted security group configurations - Enable egress filtering and region-specific access control * style: Format Python code with black - Reformat 14 Python files for consistent code style - Apply PEP 8 formatting standards - Improve code readability and maintainability * docs: Update SRE agent prerequisites and setup documentation - Convert prerequisites section to markdown table format - Add SSL certificate provider examples (no-ip.com, letsencrypt.org) - Add Identity Provider (IDP) requirement with setup_cognito.sh reference - Clarify that all prerequisites must be completed before setup - Add reference to domain name and cert paths needed for BACKEND_DOMAIN - Remove Managing OpenAPI Specifications section (covered in use-case setup) - Add Deployment Guide link to Development to Production section Addresses issues #171 and #174 * fix: Replace 'AWS Bedrock' with 'Amazon Bedrock' in SRE agent files - Updated error messages in llm_utils.py - Updated comments in both .env.example files - Ensures consistent naming convention across SRE agent codebase --------- Co-authored-by: dheerajoruganty <dheo@amazon.com> Co-authored-by: Amit Arora <aroraai@amazon.com>
2025-08-01 13:24:58 -04:00
#!/usr/bin/env python3
"""
Centralized LLM utilities with improved error handling.
This module provides a single point for LLM creation with proper error handling
for authentication, access, and configuration issues.
"""
import logging
from typing import Optional, Dict, Any
from langchain_anthropic import ChatAnthropic
from langchain_aws import ChatBedrock
from .constants import SREConstants
logger = logging.getLogger(__name__)
class LLMProviderError(Exception):
    """Base error for any failure while constructing an LLM client."""
class LLMAuthenticationError(LLMProviderError):
    """Raised when the provider rejects the supplied credentials."""
class LLMAccessError(LLMProviderError):
    """Raised when the provider denies access (permissions, quotas, region)."""
def create_llm_with_error_handling(provider: str = "bedrock", **kwargs):
    """Create an LLM instance, translating low-level failures into this
    module's exception hierarchy with actionable messages.

    Args:
        provider: LLM provider ("anthropic" or "bedrock")
        **kwargs: Additional configuration overrides

    Returns:
        LLM instance

    Raises:
        LLMProviderError: For general provider errors
        LLMAuthenticationError: For authentication failures
        LLMAccessError: For access/permission failures
        ValueError: For unsupported providers
    """
    if provider not in ("anthropic", "bedrock"):
        raise ValueError(
            f"Unsupported provider: {provider}. Use 'anthropic' or 'bedrock'"
        )

    logger.info(f"Creating LLM with provider: {provider}")

    try:
        cfg = SREConstants.get_model_config(provider, **kwargs)
        if provider == "anthropic":
            logger.info(f"Creating Anthropic LLM - Model: {cfg['model_id']}")
            return _create_anthropic_llm(cfg)
        # Only two providers are accepted, so anything else here is bedrock.
        logger.info(
            f"Creating Bedrock LLM - Model: {cfg['model_id']}, Region: {cfg['region_name']}"
        )
        return _create_bedrock_llm(cfg)
    except Exception as exc:
        message = _get_helpful_error_message(provider, exc)
        logger.error(f"Failed to create LLM: {message}")
        # Map the raw exception onto the most specific error class available,
        # preserving the original as the chained cause.
        if _is_auth_error(exc):
            raise LLMAuthenticationError(message) from exc
        if _is_access_error(exc):
            raise LLMAccessError(message) from exc
        raise LLMProviderError(message) from exc
def _create_anthropic_llm(config: Dict[str, Any]):
    """Build a ChatAnthropic client from a model-config mapping."""
    params = {
        "model": config["model_id"],
        "max_tokens": config["max_tokens"],
        "temperature": config["temperature"],
    }
    return ChatAnthropic(**params)
def _create_bedrock_llm(config: Dict[str, Any]):
    """Build a ChatBedrock client from a model-config mapping."""
    # Sampling parameters ride along in model_kwargs for Bedrock.
    model_kwargs = {
        "temperature": config["temperature"],
        "max_tokens": config["max_tokens"],
    }
    return ChatBedrock(
        model_id=config["model_id"],
        region_name=config["region_name"],
        model_kwargs=model_kwargs,
    )
def _is_auth_error(error: Exception) -> bool:
"""Check if error is authentication-related."""
error_str = str(error).lower()
auth_keywords = [
"authentication",
"unauthorized",
"invalid credentials",
"api key",
"access key",
"token",
"permission denied",
"403",
"401",
]
return any(keyword in error_str for keyword in auth_keywords)
def _is_access_error(error: Exception) -> bool:
"""Check if error is access/permission-related."""
error_str = str(error).lower()
access_keywords = [
"access denied",
"forbidden",
"not authorized",
"insufficient permissions",
"quota exceeded",
"rate limit",
"service unavailable",
"region not supported",
]
return any(keyword in error_str for keyword in access_keywords)
def _get_helpful_error_message(provider: str, error: Exception) -> str:
    """Build a remediation-oriented error message for the given provider.

    The message embeds the raw error text plus a numbered list of likely
    fixes, including switching to the alternate provider.
    """
    detail = str(error)
    # Classify once up front; the two predicates drive every branch below.
    auth_failure = _is_auth_error(error)
    access_failure = _is_access_error(error)

    if provider == "anthropic":
        if auth_failure:
            return (
                f"Anthropic authentication failed: {detail}\n"
                "Solutions:\n"
                " 1. Set ANTHROPIC_API_KEY environment variable\n"
                " 2. Check if your API key is valid and active\n"
                " 3. Try running: export ANTHROPIC_API_KEY='your-key-here'\n"
                " 4. Or switch to Bedrock: sre-agent --provider bedrock"
            )
        if access_failure:
            return (
                f"Anthropic access denied: {detail}\n"
                "Solutions:\n"
                " 1. Check if your account has sufficient credits\n"
                " 2. Verify your API key has the required permissions\n"
                " 3. Check rate limits and usage quotas\n"
                " 4. Or switch to Bedrock: sre-agent --provider bedrock"
            )
        return (
            f"Anthropic provider error: {detail}\n"
            "Solutions:\n"
            " 1. Check your internet connection\n"
            " 2. Verify Anthropic service status\n"
            " 3. Try again in a few minutes\n"
            " 4. Or switch to Bedrock: sre-agent --provider bedrock"
        )

    # Any non-anthropic provider reaching this helper is bedrock.
    if auth_failure:
        return (
            f"Amazon Bedrock authentication failed: {detail}\n"
            "Solutions:\n"
            " 1. Configure AWS credentials (aws configure)\n"
            " 2. Set AWS_PROFILE environment variable\n"
            " 3. Check IAM permissions for Bedrock access\n"
            " 4. Verify your AWS credentials are valid\n"
            " 5. Or switch to Anthropic: sre-agent --provider anthropic"
        )
    if access_failure:
        return (
            f"Amazon Bedrock access denied: {detail}\n"
            "Solutions:\n"
            " 1. Enable Claude models in Bedrock console\n"
            " 2. Request model access for your AWS account\n"
            " 3. Check if the region supports Bedrock\n"
            " 4. Verify IAM permissions for bedrock:InvokeModel\n"
            " 5. Or switch to Anthropic: sre-agent --provider anthropic"
        )
    return (
        f"Amazon Bedrock provider error: {detail}\n"
        "Solutions:\n"
        " 1. Check AWS service status\n"
        " 2. Verify the region supports Bedrock\n"
        " 3. Try a different AWS region\n"
        " 4. Check your internet connection\n"
        " 5. Or switch to Anthropic: sre-agent --provider anthropic"
    )
def validate_provider_access(provider: str = "bedrock", **kwargs) -> bool:
    """Validate if the specified provider is accessible.

    This only verifies that an LLM client can be constructed with the
    current configuration and credentials; no model invocation is made,
    so actual usage may still fail at request time.

    Args:
        provider: LLM provider to validate
        **kwargs: Additional configuration

    Returns:
        True if provider is accessible, False otherwise
    """
    try:
        # Construction exercises config lookup and client-side credential
        # wiring. The previous version bound the result to an unused local
        # and claimed a "test call" was made — neither was true.
        create_llm_with_error_handling(provider, **kwargs)
        logger.info(f"Provider {provider} validation successful")
        return True
    except Exception as e:
        # Broad catch is intentional: this is a boolean availability probe
        # and any failure mode means "not accessible".
        logger.warning(f"Provider {provider} validation failed: {e}")
        return False
def get_recommended_provider() -> str:
    """Get recommended provider based on availability.

    Returns:
        Recommended provider name
    """
    # Preference order: bedrock (the default) first, anthropic second.
    candidates = ("bedrock", "anthropic")
    for candidate in candidates:
        if not validate_provider_access(candidate):
            continue
        logger.info(f"Recommended provider: {candidate}")
        return candidate
    logger.warning("No providers are immediately accessible - defaulting to bedrock")
    return "bedrock"