2025-07-16 14:07:30 -04:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
"""
|
|
|
|
AgentCore Gateway Management Tool
|
|
|
|
|
|
|
|
This tool provides functionality to create and manage AWS AgentCore Gateways
|
|
|
|
with MCP protocol support and JWT authorization. It supports creating
|
|
|
|
gateways and adding OpenAPI targets from S3 or inline schemas.
|
|
|
|
"""
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import json
|
|
|
|
import logging
|
|
|
|
import time
|
|
|
|
from pathlib import Path
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
from typing import Any, Dict
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
import boto3
|
|
|
|
from botocore.config import Config
|
|
|
|
from botocore.exceptions import ClientError
|
|
|
|
|
2025-07-20 20:27:10 -04:00
|
|
|
# Configuration constants
# Seconds to sleep after a gateway delete call so the deletion can propagate
GATEWAY_DELETION_PROPAGATION_DELAY = 3

# Configure logging with basicConfig: INFO level, comma-separated records made
# of timestamp, process id, {file:line}, level name and message
logging.basicConfig(
    format="%(asctime)s,p%(process)s,{%(filename)s:%(lineno)d},%(levelname)s,%(message)s",
    level=logging.INFO,
)
|
|
|
|
|
|
|
|
|
2025-07-20 20:27:10 -04:00
|
|
|
def _extract_account_id_from_arn(arn: str) -> str:
    """
    Pull the account ID field out of a colon-separated AWS ARN.

    Args:
        arn: AWS ARN string

    Returns:
        The fifth colon-separated field of the ARN (the account ID position),
        or an empty string when the ARN has fewer than five fields or cannot
        be split at all. Failures are logged, never raised.
    """
    try:
        # ARN format: arn:aws:service:region:account-id:resource
        fields = arn.split(":")

        # Guard clause: too few fields means there is no account-id slot
        if len(fields) < 5:
            logging.error(f"Invalid ARN format: {arn}")
            return ""

        return fields[4]
    except Exception as e:
        # e.g. a non-string argument without .split(); report and fall back
        logging.error(f"Failed to extract account ID from ARN: {e}")
        return ""
|
|
|
|
|
|
|
|
|
2025-07-16 14:07:30 -04:00
|
|
|
def _create_agentcore_client(region: str, endpoint_url: str) -> Any:
    """
    Create and return an AgentCore client for interacting with the AWS service with retry configuration.

    Before the client is built, the region embedded in the endpoint URL (when
    one can be recognised) is compared with ``region`` so that a mismatched
    pair fails fast with a clear message.

    Args:
        region: AWS region name
        endpoint_url: AgentCore endpoint URL

    Returns:
        Configured boto3 client for bedrock-agentcore-control

    Raises:
        ValueError: If the endpoint URL contains a region that differs from ``region``
        Exception: Any error raised by ``boto3.client`` is logged and re-raised
    """
    # Local import: the module's top-level imports do not include re
    import re

    # Validate that the region matches the endpoint URL.
    # Only a region-shaped host label (e.g. "us-east-1", "us-gov-west-1") is
    # treated as the endpoint region. Taking whatever label precedes
    # "amazonaws.com" would read "vpce" out of a VPC endpoint host such as
    # "<id>.<service>.us-east-1.vpce.amazonaws.com" and reject a valid pair.
    endpoint_region_match = re.search(
        r"\.([a-z]{2}(?:-[a-z]+)+-\d+)\.(?:[a-z0-9-]+\.)*amazonaws\.com",
        endpoint_url,
    )
    if endpoint_region_match:
        endpoint_region = endpoint_region_match.group(1)
        if endpoint_region != region:
            error_msg = (
                f"Region mismatch: The --region parameter '{region}' does not match "
                f"the region in the endpoint URL '{endpoint_region}'. "
                f"Please ensure both use the same region (e.g., --region {endpoint_region})"
            )
            logging.error(error_msg)
            raise ValueError(error_msg)

    # Custom retry configuration with increased attempts and timeout
    retry_config = Config(
        retries={"max_attempts": 20, "mode": "adaptive"},
        connect_timeout=60,
        read_timeout=60,
    )

    try:
        client = boto3.client(
            "bedrock-agentcore-control",
            region_name=region,
            endpoint_url=endpoint_url,
            config=retry_config,
        )
        logging.info(f"Created AgentCore client for region {region}")
        return client
    except Exception as e:
        logging.error(f"Failed to create AgentCore client: {e}")
        raise
|
|
|
|
|
|
|
|
|
|
|
|
def _print_gateway_response(response: Dict[str, Any]) -> None:
    """
    Print formatted gateway creation response details.

    Every field is optional: anything absent from ``response`` (including the
    ``ResponseMetadata`` section and its nested entries) is shown as ``N/A``
    or as an empty list instead of raising ``KeyError``.

    Args:
        response: Gateway creation response from AWS
    """
    # Looked up once; a missing section falls back to an empty dict so the
    # metadata lines degrade to "N/A" like every other field
    response_metadata = response.get("ResponseMetadata", {})

    print("=" * 80)
    print("GATEWAY CREATION RESPONSE")
    print("=" * 80)

    # Status and Basic Info
    print(f"\n📊 Status: {response.get('status', 'N/A')}")
    print(f"✅ HTTP Status: {response_metadata.get('HTTPStatusCode', 'N/A')}")

    # Gateway Details
    print(f"\n🔗 Gateway URL: {response.get('gatewayUrl', 'N/A')}")
    print(f"📌 Gateway ID: {response.get('gatewayId', 'N/A')}")
    print(f"📝 Gateway Name: {response.get('name', 'N/A')}")
    print(f"💬 Description: {response.get('description', 'N/A')}")

    # ARN Information
    print(f"\n🏷️ Gateway ARN: {response.get('gatewayArn', 'N/A')}")
    print(f"👤 Role ARN: {response.get('roleArn', 'N/A')}")

    # Protocol Configuration
    protocol_config = response.get("protocolConfiguration", {}).get("mcp", {})
    supported_versions = ", ".join(
        str(version) for version in protocol_config.get("supportedVersions", [])
    )
    print(f"\n🔧 Protocol Type: {response.get('protocolType', 'N/A')}")
    print(f"📋 Supported Versions: {supported_versions}")
    print(f"🔍 Search Type: {protocol_config.get('searchType', 'N/A')}")

    # Authorizer Configuration
    auth_config = response.get("authorizerConfiguration", {}).get(
        "customJWTAuthorizer", {}
    )
    allowed_audience = ", ".join(
        str(audience) for audience in auth_config.get("allowedAudience", [])
    )
    print(f"\n🔐 Authorizer Type: {response.get('authorizerType', 'N/A')}")
    print(f"🌐 Discovery URL: {auth_config.get('discoveryUrl', 'N/A')}")
    print(f"👥 Allowed Audience: {allowed_audience}")

    # Timestamps
    print(f"\n📅 Created At: {response.get('createdAt', 'N/A')}")
    print(f"🔄 Updated At: {response.get('updatedAt', 'N/A')}")

    # Request Metadata
    request_id = response_metadata.get("RequestId", "N/A")
    timestamp = response_metadata.get("HTTPHeaders", {}).get("date", "N/A")

    print(f"\n🆔 Request ID: {request_id}")
    print(f"🕐 Timestamp: {timestamp}")
    print("=" * 80)
|
|
|
|
|
|
|
|
|
|
|
|
def _save_gateway_url(gateway_url: str, output_file: str = ".gateway_uri") -> None:
    """
    Write a normalized gateway URL to a file.

    Normalization drops any trailing slashes and then a single trailing
    "/mcp" path segment, so the file holds the gateway's base URL.

    Args:
        gateway_url: Gateway URL to save
        output_file: Output file path
    """
    mcp_suffix = "/mcp"

    # Trailing slashes go first so ".../mcp/" is recognised as ending in "/mcp"
    normalized_url = gateway_url.rstrip("/")
    if normalized_url.endswith(mcp_suffix):
        normalized_url = normalized_url[: -len(mcp_suffix)]

    destination = Path(output_file)
    destination.write_text(normalized_url)
    logging.info(f"Saved gateway URL to {output_file}")
|
|
|
|
|
|
|
|
|
|
|
|
def _check_gateway_exists(client: Any, gateway_name: str) -> str:
    """
    Check if a gateway with the given name already exists.

    All result pages of ``list_gateways`` are examined: while a response
    carries a ``nextToken``, the next page is requested with it. Looking only
    at the first page would report an existing gateway as missing once the
    account holds more gateways than fit in one page.

    Args:
        client: AgentCore client
        gateway_name: Name of the gateway to check

    Returns:
        Gateway ID if exists, empty string if not found

    Raises:
        ClientError: If listing gateways fails (logged, then re-raised)
    """
    try:
        next_token = None
        while True:
            # The first request carries no token; later requests continue
            # from where the previous page stopped
            if next_token:
                response = client.list_gateways(nextToken=next_token)
            else:
                response = client.list_gateways()

            for gateway in response.get("items", []):
                if gateway.get("name") == gateway_name:
                    gateway_id = gateway.get("gatewayId", "")
                    logging.info(
                        f"Found existing gateway: {gateway_name} (ID: {gateway_id})"
                    )
                    return gateway_id

            next_token = response.get("nextToken")
            if not next_token:
                break

        logging.info(f"No existing gateway found with name: {gateway_name}")
        return ""
    except ClientError as e:
        logging.error(f"Failed to list gateways: {e}")
        raise
|
|
|
|
|
|
|
|
|
|
|
|
def _delete_gateway_targets(client: Any, gateway_id: str) -> None:
    """
    Delete all targets associated with a gateway.

    Targets are collected from every result page of ``list_gateway_targets``
    (following ``nextToken``) before any deletion starts, so targets beyond
    the first page are not left behind. Entries without a ``targetId`` are
    skipped with a warning.

    Args:
        client: AgentCore client
        gateway_id: Gateway ID whose targets to delete

    Raises:
        ClientError: If listing or deleting targets fails (logged, then re-raised)
    """
    try:
        logging.info(f"Listing targets for gateway: {gateway_id}")

        # Gather the complete target list first; deleting while paging could
        # shift page contents under the continuation token
        targets = []
        list_kwargs = {"gatewayIdentifier": gateway_id}
        while True:
            targets_response = client.list_gateway_targets(**list_kwargs)
            targets.extend(targets_response.get("items", []))
            next_token = targets_response.get("nextToken")
            if not next_token:
                break
            list_kwargs["nextToken"] = next_token

        if not targets:
            logging.info(f"No targets found for gateway: {gateway_id}")
            return

        logging.info(f"Found {len(targets)} targets to delete")

        for target in targets:
            target_id = target.get("targetId", "")
            target_name = target.get("name", "Unknown")

            if not target_id:
                logging.warning(f"Target has no ID, skipping: {target_name}")
                continue

            logging.info(f"Deleting target: {target_name} (ID: {target_id})")
            delete_response = client.delete_gateway_target(
                targetId=target_id, gatewayIdentifier=gateway_id
            )
            logging.info(f"Target deleted successfully: {target_name}")
            # logging.debug already checks the level, so no explicit guard is needed
            logging.debug(f"Target delete response: {delete_response}")

        logging.info(f"All targets deleted for gateway: {gateway_id}")

    except ClientError as e:
        logging.error(f"Failed to delete targets for gateway {gateway_id}: {e}")
        raise
|
|
|
|
|
|
|
|
|
|
|
|
def _delete_gateway(client: Any, gateway_id: str) -> None:
    """
    Remove a gateway and everything attached to it.

    The gateway's targets are deleted first, then the gateway itself, and
    finally the function sleeps for GATEWAY_DELETION_PROPAGATION_DELAY seconds
    so the deletion can propagate before the caller continues.

    Args:
        client: AgentCore client
        gateway_id: Gateway ID to delete
    """
    try:
        # Step 1: targets must go before the gateway
        _delete_gateway_targets(client, gateway_id)

        # Step 2: the gateway itself
        logging.info(f"Deleting gateway: {gateway_id}")
        deletion_result = client.delete_gateway(gatewayIdentifier=gateway_id)
        logging.info(f"Gateway deleted successfully: {gateway_id}")

        debug_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)
        if debug_enabled:
            logging.debug(f"Gateway delete response: {deletion_result}")

        # Step 3: wait for deletion to propagate
        logging.info(
            f"Waiting {GATEWAY_DELETION_PROPAGATION_DELAY} seconds for deletion to propagate..."
        )
        time.sleep(GATEWAY_DELETION_PROPAGATION_DELAY)

    except ClientError as e:
        logging.error(f"Failed to delete gateway {gateway_id}: {e}")
        raise
|
|
|
|
|
|
|
|
|
|
|
|
def create_gateway(
    client: Any,
    gateway_name: str,
    role_arn: str,
    discovery_url: str,
    allowed_audience: str = None,
    allowed_clients: list = None,
    description: str = "AgentCore Gateway created via SDK",
    search_type: str = "SEMANTIC",
    protocol_version: str = "2025-03-26",
) -> Dict[str, Any]:
    """
    Create an MCP-protocol AgentCore Gateway protected by a custom JWT authorizer.

    Exactly one of the two JWT restrictions is sent: ``allowed_clients`` wins
    when it is truthy, otherwise ``allowed_audience`` is used.

    Args:
        client: AgentCore client
        gateway_name: Name for the gateway
        role_arn: IAM role ARN with necessary permissions
        discovery_url: JWT discovery URL
        allowed_audience: Allowed JWT audience (for Auth0/Okta)
        allowed_clients: Allowed JWT client IDs (for Cognito); a single
            non-list value is wrapped in a list
        description: Gateway description
        search_type: MCP search type (default: SEMANTIC)
        protocol_version: MCP protocol version (default: 2025-03-26)

    Returns:
        Gateway creation response

    Raises:
        ValueError: If neither allowed_audience nor allowed_clients is given
        ClientError: If the create call fails (logged, then re-raised)
    """
    # Assemble the JWT authorizer: Cognito uses client IDs, Auth0/Okta an audience
    jwt_authorizer = {"discoveryUrl": discovery_url}
    if allowed_clients:
        client_ids = allowed_clients
        if not isinstance(client_ids, list):
            client_ids = [client_ids]
        jwt_authorizer["allowedClients"] = client_ids
    elif allowed_audience:
        jwt_authorizer["allowedAudience"] = [allowed_audience]
    else:
        raise ValueError("Either allowed_audience or allowed_clients must be specified")

    request_kwargs = {
        "name": gateway_name,
        "roleArn": role_arn,
        "protocolType": "MCP",
        "authorizerType": "CUSTOM_JWT",
        "authorizerConfiguration": {"customJWTAuthorizer": jwt_authorizer},
        "protocolConfiguration": {
            "mcp": {
                "searchType": search_type,
                "supportedVersions": [protocol_version],
            }
        },
        "description": description,
        "exceptionLevel": "DEBUG",
    }

    try:
        creation_response = client.create_gateway(**request_kwargs)
        new_gateway_id = creation_response.get("gatewayId")
        logging.info(f"Created gateway: {new_gateway_id}")
        return creation_response
    except ClientError as e:
        logging.error(f"Failed to create gateway: {e}")
        raise
|
|
|
|
|
|
|
|
|
|
|
|
def create_s3_target(
    client: Any,
    gateway_id: str,
    s3_uri: str,
    provider_arn: str,
    target_name_prefix: str = "open",
    description: str = "S3 target for OpenAPI schema",
) -> Dict[str, Any]:
    """
    Register a gateway target whose OpenAPI schema is read from S3.

    The target authenticates with an API-key credential provider: the key is
    sent in the "X-API-KEY" request header.

    Args:
        client: AgentCore client
        gateway_id: Gateway identifier
        s3_uri: S3 URI of the OpenAPI schema
        provider_arn: Credential provider ARN (used here as an API-key provider)
        target_name_prefix: Value passed as the target's name
        description: Description for the target

    Returns:
        Target creation response

    Raises:
        ClientError: If the create call fails (logged, then re-raised)
    """
    # Where the schema lives
    schema_source = {"s3": {"uri": s3_uri}}
    target_configuration = {"mcp": {"openApiSchema": schema_source}}

    # API key credential provider configuration
    # (credentialLocation is "HEADER" here; the original comment names
    # QUERY_PARAMETER as the alternative)
    api_key_provider = {
        "providerArn": provider_arn,
        "credentialLocation": "HEADER",
        "credentialParameterName": "X-API-KEY",
    }
    credential_configuration = {
        "credentialProviderType": "API_KEY",
        "credentialProvider": {"apiKeyCredentialProvider": api_key_provider},
    }

    try:
        creation_response = client.create_gateway_target(
            gatewayIdentifier=gateway_id,
            name=target_name_prefix,
            description=description,
            targetConfiguration=target_configuration,
            credentialProviderConfigurations=[credential_configuration],
        )
        new_target_id = creation_response.get("targetId")
        logging.info(f"Created S3 target: {new_target_id}")
        return creation_response
    except ClientError as e:
        logging.error(f"Failed to create S3 target: {e}")
        raise
|
|
|
|
|
|
|
|
|
|
|
|
def create_inline_target(
    client: Any,
    gateway_id: str,
    openapi_schema: str,
    provider_arn: str,
    target_name_prefix: str = "inline",
    description: str = "Inline target for OpenAPI schema",
) -> Dict[str, Any]:
    """
    Register a gateway target whose OpenAPI schema is supplied inline.

    The target authenticates with an OAuth credential provider and requests
    an empty scope list.

    Args:
        client: AgentCore client
        gateway_id: Gateway identifier
        openapi_schema: Inline OpenAPI schema as string
        provider_arn: OAuth credential provider ARN
        target_name_prefix: Value passed as the target's name
        description: Description for the target

    Returns:
        Target creation response

    Raises:
        ClientError: If the create call fails (logged, then re-raised)
    """
    # Schema travels in the request itself rather than being fetched from S3
    target_configuration = {
        "mcp": {"openApiSchema": {"inlinePayload": openapi_schema}}
    }

    oauth_provider = {"providerArn": provider_arn, "scopes": []}
    credential_configuration = {
        "credentialProviderType": "OAUTH",
        "credentialProvider": {"oauthCredentialProvider": oauth_provider},
    }

    try:
        creation_response = client.create_gateway_target(
            gatewayIdentifier=gateway_id,
            name=target_name_prefix,
            description=description,
            targetConfiguration=target_configuration,
            credentialProviderConfigurations=[credential_configuration],
        )
        new_target_id = creation_response.get("targetId")
        logging.info(f"Created inline target: {new_target_id}")
        return creation_response
    except ClientError as e:
        logging.error(f"Failed to create inline target: {e}")
        raise
|
|
|
|
|
|
|
|
|
|
|
|
def verify_gateway(client: Any, gateway_id: str) -> Dict[str, Any]:
    """
    Look up a gateway by identifier and report its status.

    Args:
        client: AgentCore client exposing ``get_gateway``
        gateway_id: Identifier of the gateway to look up

    Returns:
        The response returned by ``get_gateway``

    Raises:
        ClientError: Logged and re-raised when the lookup fails
    """
    try:
        details = client.get_gateway(gatewayIdentifier=gateway_id)
    except ClientError as err:
        logging.error(f"Failed to verify gateway: {err}")
        raise

    status = details.get("status")
    logging.info(f"Verified gateway: {gateway_id}, Status: {status}")
    return details
|
|
|
|
|
|
|
|
|
|
|
|
def list_gateway_targets(client: Any, gateway_id: str) -> Dict[str, Any]:
    """
    List all targets for a gateway, following pagination.

    The first page is requested with only the gateway identifier. If the
    response carries a ``nextToken``, further pages are fetched until no
    token is returned and their ``items`` are appended to the first page's.

    Args:
        client: AgentCore client exposing ``list_gateway_targets``
        gateway_id: Gateway identifier

    Returns:
        The service response. When only one page exists it is returned
        unchanged. When several pages exist, a copy of the first response is
        returned whose ``items`` holds the targets from every page and whose
        ``nextToken`` has been removed.

    Raises:
        ClientError: Logged and re-raised when any page request fails
    """
    try:
        response = client.list_gateway_targets(gatewayIdentifier=gateway_id)

        next_token = response.get("nextToken")
        if next_token:
            # Copy so the caller never sees the client's own list mutated.
            items = list(response.get("items", []))
            while next_token:
                page = client.list_gateway_targets(
                    gatewayIdentifier=gateway_id, nextToken=next_token
                )
                items.extend(page.get("items", []))
                next_token = page.get("nextToken")

            # Drop the stale token: the merged result is complete.
            response = {k: v for k, v in response.items() if k != "nextToken"}
            response["items"] = items

        logging.info(
            f"Found {len(response.get('items', []))} targets for gateway {gateway_id}"
        )
        return response
    except ClientError as e:
        logging.error(f"Failed to list gateway targets: {e}")
        raise
|
|
|
|
|
|
|
|
|
|
|
|
def main():
|
|
|
|
"""Main function to orchestrate gateway creation and management."""
|
|
|
|
parser = argparse.ArgumentParser(
|
|
|
|
description="Create and manage AWS AgentCore Gateways with MCP protocol support"
|
|
|
|
)
|
|
|
|
|
|
|
|
# Required arguments
|
|
|
|
parser.add_argument("gateway_name", help="Name for the AgentCore Gateway")
|
|
|
|
|
|
|
|
# AWS configuration
|
|
|
|
parser.add_argument(
|
|
|
|
"--region", default="us-east-1", help="AWS region (default: us-east-1)"
|
|
|
|
)
|
|
|
|
parser.add_argument(
|
|
|
|
"--endpoint-url",
|
|
|
|
default="https://bedrock-agentcore-control.us-east-1.amazonaws.com",
|
|
|
|
help="AgentCore endpoint URL",
|
|
|
|
)
|
|
|
|
parser.add_argument(
|
|
|
|
"--role-arn", required=True, help="IAM Role ARN with gateway permissions"
|
|
|
|
)
|
|
|
|
|
|
|
|
# Authorization configuration
|
|
|
|
parser.add_argument(
|
|
|
|
"--discovery-url", required=True, help="JWT discovery URL for authorization"
|
|
|
|
)
|
|
|
|
parser.add_argument(
|
|
|
|
"--allowed-audience",
|
|
|
|
default="MCPGateway",
|
|
|
|
help="Allowed JWT audience (default: MCPGateway)",
|
|
|
|
)
|
|
|
|
parser.add_argument(
|
|
|
|
"--allowed-clients", help="Allowed JWT client IDs (for Cognito)"
|
|
|
|
)
|
|
|
|
|
|
|
|
# Gateway configuration
|
|
|
|
parser.add_argument(
|
|
|
|
"--description-for-gateway",
|
|
|
|
default="AgentCore Gateway created via SDK",
|
|
|
|
help="Gateway description",
|
|
|
|
)
|
|
|
|
parser.add_argument(
|
|
|
|
"--description-for-target",
|
|
|
|
action="append",
|
|
|
|
help="Target description (can be specified multiple times)",
|
|
|
|
)
|
|
|
|
parser.add_argument(
|
|
|
|
"--search-type", default="SEMANTIC", help="MCP search type (default: SEMANTIC)"
|
|
|
|
)
|
|
|
|
parser.add_argument(
|
|
|
|
"--protocol-version",
|
|
|
|
default="2025-03-26",
|
|
|
|
help="MCP protocol version (default: 2025-03-26)",
|
|
|
|
)
|
|
|
|
|
|
|
|
# Target configuration
|
|
|
|
parser.add_argument(
|
|
|
|
"--create-s3-target", action="store_true", help="Create an S3 OpenAPI target"
|
|
|
|
)
|
|
|
|
parser.add_argument(
|
|
|
|
"--s3-uri",
|
|
|
|
action="append",
|
|
|
|
help="S3 URI for OpenAPI schema (can be specified multiple times)",
|
|
|
|
)
|
|
|
|
parser.add_argument(
|
|
|
|
"--create-inline-target",
|
|
|
|
action="store_true",
|
|
|
|
help="Create an inline OpenAPI target",
|
|
|
|
)
|
|
|
|
parser.add_argument(
|
|
|
|
"--openapi-schema-file", help="File containing OpenAPI schema for inline target"
|
|
|
|
)
|
|
|
|
parser.add_argument(
|
|
|
|
"--provider-arn", help="OAuth credential provider ARN for targets"
|
|
|
|
)
|
|
|
|
|
|
|
|
# Output options
|
|
|
|
parser.add_argument(
|
|
|
|
"--save-gateway-url",
|
|
|
|
action="store_true",
|
|
|
|
help="Save gateway URL to .gateway_uri file",
|
|
|
|
)
|
|
|
|
parser.add_argument(
|
|
|
|
"--delete-gateway-if-exists",
|
|
|
|
action="store_true",
|
|
|
|
help="Delete gateway if it already exists before creating new one",
|
|
|
|
)
|
|
|
|
parser.add_argument(
|
|
|
|
"--output-json", action="store_true", help="Output responses in JSON format"
|
|
|
|
)
|
2025-07-20 20:27:10 -04:00
|
|
|
parser.add_argument(
|
|
|
|
"--enable-observability",
|
|
|
|
action="store_true",
|
|
|
|
help="Enable CloudWatch logs and X-Ray tracing for the gateway",
|
|
|
|
)
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
|
# Create AgentCore client
|
|
|
|
client = _create_agentcore_client(args.region, args.endpoint_url)
|
|
|
|
|
|
|
|
# Check if gateway already exists and handle deletion if requested
|
|
|
|
existing_gateway_id = _check_gateway_exists(client, args.gateway_name)
|
|
|
|
if existing_gateway_id:
|
|
|
|
if args.delete_gateway_if_exists:
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
logging.info("Deleting existing gateway before creating new one")
|
2025-07-16 14:07:30 -04:00
|
|
|
_delete_gateway(client, existing_gateway_id)
|
|
|
|
else:
|
|
|
|
logging.warning(
|
|
|
|
f"Gateway '{args.gateway_name}' already exists (ID: {existing_gateway_id})"
|
|
|
|
)
|
|
|
|
logging.warning(
|
|
|
|
"Use --delete-gateway-if-exists to delete it before creating a new one"
|
|
|
|
)
|
|
|
|
print(f"❌ Gateway '{args.gateway_name}' already exists")
|
|
|
|
print(f" Gateway ID: {existing_gateway_id}")
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from stateless to learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
print(" Use --delete-gateway-if-exists flag to delete and recreate")
|
2025-07-16 14:07:30 -04:00
|
|
|
exit(1)
|
|
|
|
|
|
|
|
# Create gateway
|
|
|
|
logging.info(f"Creating gateway: {args.gateway_name}")
|
|
|
|
create_response = create_gateway(
|
|
|
|
client=client,
|
|
|
|
gateway_name=args.gateway_name,
|
|
|
|
role_arn=args.role_arn,
|
|
|
|
discovery_url=args.discovery_url,
|
|
|
|
allowed_audience=args.allowed_audience if not args.allowed_clients else None,
|
|
|
|
allowed_clients=(
|
|
|
|
args.allowed_clients.split(",") if args.allowed_clients else None
|
|
|
|
),
|
|
|
|
description=args.description_for_gateway,
|
|
|
|
search_type=args.search_type,
|
|
|
|
protocol_version=args.protocol_version,
|
|
|
|
)
|
|
|
|
|
|
|
|
if args.output_json:
|
|
|
|
print(json.dumps(create_response, indent=2, default=str))
|
|
|
|
else:
|
|
|
|
_print_gateway_response(create_response)
|
|
|
|
|
|
|
|
gateway_id = create_response["gatewayId"]
|
|
|
|
gateway_url = create_response.get("gatewayUrl", "")
|
2025-07-20 20:27:10 -04:00
|
|
|
gateway_arn = create_response.get("gatewayArn", "")
|
|
|
|
|
|
|
|
# Check if observability was requested
|
|
|
|
if args.enable_observability:
|
|
|
|
logging.error("Observability feature is not yet supported")
|
2025-08-01 13:24:58 -04:00
|
|
|
print(
|
|
|
|
"\n❌ Error: The --enable-observability feature is currently not supported but will be available soon."
|
|
|
|
)
|
2025-07-20 20:27:10 -04:00
|
|
|
print(" Please run the command without the --enable-observability flag.")
|
|
|
|
exit(1)
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
# Save gateway URL if requested
|
|
|
|
if args.save_gateway_url and gateway_url:
|
|
|
|
_save_gateway_url(gateway_url)
|
|
|
|
|
|
|
|
# Verify gateway creation
|
|
|
|
verify_response = verify_gateway(client, gateway_id)
|
|
|
|
if args.output_json:
|
|
|
|
print("\nGateway Verification:")
|
|
|
|
print(json.dumps(verify_response, indent=2, default=str))
|
|
|
|
|
|
|
|
# Create S3 targets if requested
|
|
|
|
if args.create_s3_target:
|
|
|
|
if not args.provider_arn:
|
|
|
|
logging.error("Provider ARN required for creating targets")
|
|
|
|
parser.error("--provider-arn is required when creating targets")
|
|
|
|
|
|
|
|
if not args.s3_uri:
|
|
|
|
logging.error("At least one S3 URI required when creating S3 targets")
|
|
|
|
parser.error("--s3-uri is required when creating S3 targets")
|
|
|
|
|
|
|
|
# Handle multiple S3 URIs and descriptions
|
|
|
|
s3_uris = args.s3_uri
|
|
|
|
descriptions = args.description_for_target or []
|
|
|
|
|
|
|
|
# Ensure we have descriptions for all URIs (use default if not enough provided)
|
|
|
|
while len(descriptions) < len(s3_uris):
|
|
|
|
descriptions.append("S3 target for OpenAPI schema")
|
|
|
|
|
|
|
|
s3_responses = []
|
|
|
|
for i, s3_uri in enumerate(s3_uris):
|
|
|
|
# Extract a meaningful name from the S3 URI for the target
|
|
|
|
target_name = (
|
|
|
|
s3_uri.split("/")[-1].replace(".yaml", "").replace(".json", "")
|
|
|
|
)
|
|
|
|
if not target_name or target_name == s3_uri:
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from a stateless tool into a learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
target_name = f"target-{i + 1}"
|
2025-07-16 14:07:30 -04:00
|
|
|
|
|
|
|
# Replace underscores with hyphens to meet AWS naming requirements
|
|
|
|
# AWS requires: ([0-9a-zA-Z][-]?){1,100}
|
|
|
|
target_name = target_name.replace("_", "-")
|
|
|
|
|
|
|
|
logging.info(
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from a stateless tool into a learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
f"Creating S3 OpenAPI target {i + 1}/{len(s3_uris)}: {target_name}"
|
2025-07-16 14:07:30 -04:00
|
|
|
)
|
|
|
|
s3_response = create_s3_target(
|
|
|
|
client=client,
|
|
|
|
gateway_id=gateway_id,
|
|
|
|
s3_uri=s3_uri,
|
|
|
|
provider_arn=args.provider_arn,
|
|
|
|
target_name_prefix=target_name,
|
|
|
|
description=descriptions[i],
|
|
|
|
)
|
|
|
|
s3_responses.append(s3_response)
|
|
|
|
|
|
|
|
if args.output_json:
|
feat(02-use-cases): integrate AgentCore Memory with SRE Agent for intelligent context-aware incident response (#210)
* feat: integrate long-term memory system into SRE agent
- Add AgentCore Memory integration with three memory strategies:
* User preferences (escalation, notification, workflow preferences)
* Infrastructure knowledge (dependencies, patterns, baselines)
* Investigation summaries (timeline, actions, findings)
- Implement memory tools for save/retrieve operations
- Add automatic memory capture through hooks and pattern recognition
- Extend agent state to support memory context
- Integrate memory-aware planning in supervisor agent
- Add comprehensive test coverage for memory functionality
- Create detailed documentation with usage examples
This transforms the SRE agent from a stateless tool into a learning assistant
that becomes more valuable over time by remembering user preferences,
infrastructure patterns, and investigation outcomes.
Addresses issue #164
* feat: environment variable config, agent routing fixes, and project organization
- Move USER_ID/SESSION_ID from metadata parsing to environment variables
- Add .memory_id to .gitignore for local memory state
- Update .gitignore to use .scratchpad/ folder instead of .scratchpad.md
- Fix agent routing issues with supervisor prompt and graph node naming
- Add conversation memory tracking for all agents and supervisor
- Improve agent metadata system with centralized constants
- Add comprehensive logging and debugging for agent tool access
- Update deployment script to pass user_id/session_id in payload
- Create .scratchpad/ folder structure for better project organization
* feat: enhance SRE agent with automatic report archiving and error fixes
- Add automatic archiving system for reports by date
- Include user_id in report filenames for better organization
- Fix Pydantic validation error with string-to-list conversion for investigation steps
- Add content length truncation for memory storage to prevent validation errors
- Remove status line from report output for cleaner formatting
- Implement date-based folder organization (YYYY-MM-DD format)
- Add memory content length limits configuration in constants
Key improvements:
- Reports now auto-archive old files when saving new ones
- User-specific filenames: query_user_id_UserName_YYYYMMDD_HHMMSS.md
- Robust error handling for memory content length limits
- Backward compatibility with existing filename formats
* feat: fix memory retrieval system for cross-session searches and user personalization
Key fixes and improvements:
- Fix case preservation in actor_id sanitization (Carol remains Carol, not carol)
- Enable cross-session memory searches for infrastructure and investigation memories
- Add XML parsing support for investigation summaries stored in XML format
- Enhance user preference integration throughout the system
- Add comprehensive debug logging for memory retrieval processes
- Update prompts to support user-specific communication styles and preferences
Memory system now properly:
- Preserves user case in memory namespaces (/sre/users/Carol vs /sre/users/carol)
- Searches across all sessions for planning context vs session-specific for current state
- Parses both JSON and XML formatted investigation memories
- Adapts investigation approach based on user preferences and historical patterns
- Provides context-aware planning using infrastructure knowledge and past investigations
* feat: enhance SRE agent with user-specific memory isolation and anti-hallucination measures
Memory System Improvements:
- Fix memory isolation to retrieve only user-specific memories (Alice doesn't see Carol's data)
- Implement proper namespace handling for cross-session vs session-specific searches
- Add detailed logging for memory retrieval debugging and verification
- Remove verbose success logs, keep only error logs for cleaner output
Anti-Hallucination Enhancements:
- Add tool output validation requirements to agent prompts
- Implement timestamp fabrication prevention (use 2024-* format from backend)
- Require tool attribution for all metrics and findings in reports
- Add backend data alignment patterns for consistent data references
- Update supervisor aggregation prompts to flag unverified claims
Code Organization:
- Extract hardcoded prompts from supervisor.py to external prompt files
- Add missing session_id parameters to SaveInfrastructureTool and SaveInvestigationTool
- Improve memory client namespace documentation and cross-session search logic
- Reduce debug logging noise while maintaining error tracking
Verification Complete:
- Memory isolation working correctly (only user-specific data retrieval)
- Cross-session memory usage properly configured for planning and investigations
- Memory integration confirmed in report generation pipeline
- Anti-hallucination measures prevent fabricated metrics and timestamps
* feat: organize utility scripts in dedicated scripts folder
Script Organization:
- Move manage_memories.py to scripts/ folder with updated import paths
- Move configure_gateway.sh to scripts/ folder with corrected PROJECT_ROOT path
- Copy user_config.yaml to scripts/ folder for self-contained script usage
Path Fixes:
- Update manage_memories.py to import sre_agent module from correct relative path
- Fix .memory_id file path resolution for new script location
- Update configure_gateway.sh PROJECT_ROOT to point to correct parent directory
- Add fallback logic to find user_config.yaml in scripts/ or project root
Script Improvements:
- Update help text and examples to use 'uv run python scripts/' syntax
- Make manage_memories.py executable with proper permissions
- Maintain backward compatibility for custom config file paths
- Self-contained scripts folder with all required dependencies
Verification:
- All scripts work correctly from new location
- Memory management functions operate properly
- Gateway configuration handles paths correctly
- User preferences loading works from scripts directory
* docs: update SSL certificate paths to use /opt/ssl standard location
- Update README.md to reference /opt/ssl for SSL certificate paths
- Update docs/demo-environment.md to use /opt/ssl paths
- Clean up scripts/configure_gateway.sh SSL fallback paths
- Remove duplicate and outdated SSL path references
- Establish /opt/ssl as the standard SSL certificate location
This ensures consistent SSL certificate management across all
documentation and scripts, supporting the established /opt/ssl
directory with proper ubuntu:ubuntu ownership.
* feat: enhance memory system with infrastructure parsing fix and user personalization analysis
Infrastructure Memory Parsing Improvements:
- Fix infrastructure memory parsing to handle both JSON and plain text formats
- Convert plain text memories to structured InfrastructureKnowledge objects
- Change warning logs to debug level for normal text-to-structure conversion
- Ensure all infrastructure memories are now retrievable and usable
User Personalization Documentation:
- Add comprehensive memory system analysis comparing Alice vs Carol reports
- Create docs/examples/ folder with real investigation reports demonstrating personalization
- Document side-by-side communication differences based on user preferences
- Show how the same technical incident produces different reports for different user roles
Example Reports Added:
- Alice's technical detailed investigation report (technical role preferences)
- Carol's business-focused executive summary report (executive role preferences)
- Memory system analysis with extensive side-by-side comparisons
This demonstrates the memory system's ability to:
- Maintain technical accuracy while adapting presentation style
- Apply user-specific escalation procedures and communication channels
- Build institutional knowledge about recurring infrastructure patterns
- Personalize identical technical incidents for different organizational roles
* feat: enhance memory system with automatic pattern extraction and improved logging
## Memory System Enhancements
- **Individual agent memory integration**: Every agent response now triggers automatic memory pattern extraction through on_agent_response() hooks
- **Enhanced conversation logging**: Added detailed message breakdown showing USER/ASSISTANT/TOOL message counts and tool names called
- **Fixed infrastructure extraction**: Resolved hardcoded agent name issues by using SREConstants for agent identification
- **Comprehensive memory persistence**: All agent responses and tool executions stored as conversation memory with proper session tracking
## Tool Architecture Clarification
- **Centralized memory access**: Confirmed only supervisor agent has direct access to memory tools (retrieve_memory, save_*)
- **Individual agent focus**: Individual agents have NO memory tools, only domain-specific tools (5 tools each for metrics, logs, k8s, runbooks)
- **Automatic pattern recognition**: Memory capture happens automatically through hooks, not manual tool calls by individual agents
## Documentation Updates
- **Updated memory-system.md**: Comprehensive design documentation reflecting current implementation
- **Added example analyses**: Created flight-booking-analysis.md and api-response-time-analysis.md in docs/examples/
- **Enhanced README.md**: Added memory system overview and personalized investigation examples
- **Updated .gitignore**: Now ignores entire reports/ folder instead of just .md files
## Implementation Improvements
- **Event ID tracking**: All memory operations generate and log event IDs for verification
- **Pattern extraction confirmation**: Logs confirm pattern extraction working for all agent types
- **Memory save verification**: Comprehensive logging shows successful saves across all memory types
- **Script enhancements**: manage_memories.py now handles duplicate removal and improved user management
* docs: enhance memory system documentation with planning agent memory usage examples
- Add real agent.log snippets showing planning agent retrieving and using memory context
- Document XML-structured prompts for improved Claude model interaction
- Explain JSON response format enforcement and infrastructure knowledge extraction
- Add comprehensive logging and monitoring details
- Document actor ID design for proper memory namespace isolation
- Fix ASCII flow diagram alignment for better readability
- Remove temporal framing and present features as current design facts
* docs: add AWS documentation links and clean up memory system documentation
- Add hyperlink to Amazon Bedrock AgentCore Memory main documentation
- Link to Memory Getting Started Guide for the three memory strategies
- Remove Legacy Pattern Recognition section from documentation (code remains)
- Remove Error Handling and Fallbacks section to focus on core functionality
- Keep implementation details in code while streamlining public documentation
* docs: reorganize memory-system.md to eliminate redundancies
- Merged Memory Tool Architecture and Planning sections into unified section
- Consolidated all namespace/actor_id explanations in architecture section
- Combined pattern recognition and memory capture content
- Created dedicated Agent Memory Integration section with examples
- Removed ~15-20% redundant content while improving clarity
- Improved document structure for better navigation
* style: apply ruff formatting and fix code style issues
- Applied ruff auto-formatting to all Python files
- Fixed 383 style issues automatically
- Remaining issues require manual intervention:
- 29 ruff errors (bare except, unused variables, etc.)
- 61 mypy type errors (missing annotations, implicit Optional)
- Verified memory system functionality matches documentation
- Confirmed user personalization working correctly in reports
* docs: make benefits section more succinct in memory-system.md
- Consolidated 12 bullet points into 5 focused benefits
- Removed redundant three-category structure (Users/Teams/Operations)
- Maintained all key value propositions while improving readability
- Reduced section length by ~60% while preserving essential information
* feat: add comprehensive cleanup script with memory deletion
- Added cleanup.sh script to delete all AWS resources (gateway, runtime, memory)
- Integrated memory deletion using bedrock_agentcore MemoryClient
- Added proper error handling and graceful fallbacks
- Updated execution order: servers → gateway → memory → runtime → local files
- Added memory deletion to README.md cleanup instructions
- Includes confirmation prompts and --force option for automation
* fix: preserve .env, .venv, and reports in cleanup script
- Modified cleanup script to only remove AWS-generated configuration files
- Preserved .env files for development continuity
- Preserved .venv directories to avoid reinstalling dependencies
- Preserved reports/ directory containing investigation history
- Files removed: gateway URIs, tokens, agent ARNs, memory IDs only
- Updated documentation to clarify preserved vs removed files
* fix: use correct bedrock-agentcore-control client for gateway operations
- Changed boto3 client from 'bedrock-agentcore' to 'bedrock-agentcore-control'
- Fixes 'list_gateways' method not found error during gateway deletion
- Both gateway and runtime deletion now use the correct control plane client
* docs: add memory system initialization timing guidance
- Added note that memory system takes 10-12 minutes to be ready
- Added steps to check memory status with list command after 10 minutes
- Added instruction to run update command again once memory is ready
- Provides clear workflow for memory system setup and prevents user confusion
* docs: comprehensive documentation update and cleanup
- Remove unused root .env and .env.example files (not referenced by any code)
- Update configuration.md with comprehensive config file documentation
- Add configuration overview table with setup instructions and auto-generation info
- Consolidate specialized-agents.md content into system-components.md
- Update system-components.md with complete AgentCore architecture
- Add detailed sections for AgentCore Runtime, Gateway, and Memory primitives
- Remove cli-reference.md (excessive documentation for limited use)
- Update README.md to reference configuration guide in setup section
- Clean up documentation links and organization
The documentation now provides a clear, consolidated view of the system
architecture and configuration with proper cross-references and setup guidance.
* feat: improve runtime deployment and invocation robustness
- Increase deletion wait time to 150s for agent runtime cleanup
- Add retry logic with exponential backoff for MCP rate limiting (429 errors)
- Add session_id and user_id to agent state for memory retrieval
- Filter out /ping endpoint logs to reduce noise
- Increase boto3 read timeout to 5 minutes for long-running operations
- Add clear error messages for agent name conflicts
- Update README to clarify virtual environment requirement for scripts
- Fix session ID generation to meet 33+ character requirement
These changes improve reliability when deploying and invoking agents,
especially under heavy load or with complex queries that take time.
* chore: remove accidentally committed reports folder
Removed 130+ markdown report files from the reports/ directory that were
accidentally committed. The .gitignore already includes reports/ to prevent
future commits of these generated files.
2025-08-06 17:49:56 -04:00
|
|
|
print(f"\nS3 Target {i + 1} Creation:")
|
2025-07-16 14:07:30 -04:00
|
|
|
print(json.dumps(s3_response, indent=2, default=str))
|
|
|
|
|
|
|
|
if not args.output_json:
|
|
|
|
print(f"\n✅ Successfully created {len(s3_responses)} S3 targets")
|
|
|
|
|
|
|
|
# Create inline target if requested
|
|
|
|
if args.create_inline_target:
|
|
|
|
if not args.provider_arn:
|
|
|
|
logging.error("Provider ARN required for creating targets")
|
|
|
|
parser.error("--provider-arn is required when creating targets")
|
|
|
|
|
|
|
|
if not args.openapi_schema_file:
|
|
|
|
logging.error("OpenAPI schema file required for inline target")
|
|
|
|
parser.error("--openapi-schema-file is required for inline targets")
|
|
|
|
|
|
|
|
# Read OpenAPI schema from file
|
|
|
|
schema_content = Path(args.openapi_schema_file).read_text()
|
|
|
|
|
|
|
|
logging.info("Creating inline OpenAPI target")
|
|
|
|
inline_response = create_inline_target(
|
|
|
|
client=client,
|
|
|
|
gateway_id=gateway_id,
|
|
|
|
openapi_schema=schema_content,
|
|
|
|
provider_arn=args.provider_arn,
|
|
|
|
description=args.description_for_target,
|
|
|
|
)
|
|
|
|
|
|
|
|
if args.output_json:
|
|
|
|
print("\nInline Target Creation:")
|
|
|
|
print(json.dumps(inline_response, indent=2, default=str))
|
|
|
|
|
|
|
|
# List all targets
|
|
|
|
if args.create_s3_target or args.create_inline_target:
|
|
|
|
targets_response = list_gateway_targets(client, gateway_id)
|
|
|
|
if args.output_json:
|
|
|
|
print("\nGateway Targets:")
|
|
|
|
print(json.dumps(targets_response, indent=2, default=str))
|
|
|
|
else:
|
|
|
|
targets = targets_response.get("items", [])
|
|
|
|
print(f"\n📋 Gateway has {len(targets)} target(s):")
|
|
|
|
for target in targets:
|
|
|
|
print(
|
|
|
|
f" • {target.get('name', 'Unknown')} (ID: {target.get('targetId', 'N/A')})"
|
|
|
|
)
|
|
|
|
print(f" Description: {target.get('description', 'N/A')}")
|
|
|
|
print(f" Status: {target.get('status', 'N/A')}")
|
|
|
|
|
|
|
|
print("\n🎉 Gateway creation and configuration completed successfully!")
|
|
|
|
if gateway_url:
|
|
|
|
print(f"🔗 Gateway URL: {gateway_url}")
|
|
|
|
logging.info("Gateway creation and configuration completed successfully")
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
main()
|